提交 6871fe63 编写于 作者: M minqiyang

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into transfer_gru_unit

...@@ -42,12 +42,6 @@ repos: ...@@ -42,12 +42,6 @@ repos:
entry: bash ./tools/codestyle/pylint_pre_commit.hook entry: bash ./tools/codestyle/pylint_pre_commit.hook
language: system language: system
files: \.(py)$ files: \.(py)$
- repo: https://github.com/PaddlePaddle/pre-commit-golang
sha: 8337620115c25ff8333f1b1a493bd031049bd7c0
hooks:
- id: go-fmt
types:
- go
- repo: local - repo: local
hooks: hooks:
- id: copyright_checker - id: copyright_checker
......
...@@ -44,6 +44,7 @@ ...@@ -44,6 +44,7 @@
| qingqing01 | Qing-Qing Dang | | qingqing01 | Qing-Qing Dang |
| reyoung | Yang Yu | | reyoung | Yang Yu |
| Sand3r- | Michal Gallus | | Sand3r- | Michal Gallus |
| sfraczek | Sylwester Fraczek |
| Superjom | Chun-Wei Yan | | Superjom | Chun-Wei Yan |
| tensor-tang | Jian Tang | | tensor-tang | Jian Tang |
| tianbingsz | Tian-Bing Xu | | tianbingsz | Tian-Bing Xu |
...@@ -54,6 +55,7 @@ ...@@ -54,6 +55,7 @@
| wangyang59 | Yang Wang | | wangyang59 | Yang Wang |
| wangzhen-nlp | Zhen Wang | | wangzhen-nlp | Zhen Wang |
| wen-bo-yang | Wen-Bo Yang | | wen-bo-yang | Wen-Bo Yang |
| wojtuss | Wojciech Uss |
| wwhu | Wei-Wei Hu | | wwhu | Wei-Wei Hu |
| xinghai-sun | Xing-Hai Sun | | xinghai-sun | Xing-Hai Sun |
| Xreki | Yi-Qun Liu | | Xreki | Yi-Qun Liu |
......
...@@ -54,23 +54,12 @@ option(WITH_NGRAPH "Compile PaddlePaddle with nGraph support." OFF) ...@@ -54,23 +54,12 @@ option(WITH_NGRAPH "Compile PaddlePaddle with nGraph support." OFF)
option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON) option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON)
option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF) option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF)
option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON) option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON)
option(WITH_DOUBLE "Compile PaddlePaddle with double precision" OFF)
option(WITH_RDMA "Compile PaddlePaddle with RDMA support" OFF)
option(WITH_TIMER "Compile PaddlePaddle with stats timer" OFF)
option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler and gperftools" OFF) option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler and gperftools" OFF)
option(WITH_JEMALLOC "Compile PaddlePaddle with jemalloc" OFF) option(WITH_JEMALLOC "Compile PaddlePaddle with jemalloc" OFF)
option(WITH_DOC "Compile PaddlePaddle with documentation" OFF)
option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF) option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF)
option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF) option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF)
option(ON_TRAVIS "Exclude special unit test on Travis CI" OFF)
option(WITH_FLUID_ONLY "Compile PaddlePaddle fluid only" OFF)
option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF)
option(GLIDE_INSTALL "Download and install go dependencies " ON)
option(WITH_DISTRIBUTE "Compile with distributed support" OFF) option(WITH_DISTRIBUTE "Compile with distributed support" OFF)
option(WITH_PSLIB "Compile with pslib support" OFF) option(WITH_PSLIB "Compile with pslib support" OFF)
option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF)
option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen" OFF)
option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF)
option(WITH_CONTRIB "Compile the third-party contributation" OFF) option(WITH_CONTRIB "Compile the third-party contributation" OFF)
option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF) option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF)
option(WITH_ANAKIN "Compile with Anakin library" OFF) option(WITH_ANAKIN "Compile with Anakin library" OFF)
...@@ -105,8 +94,6 @@ endif() ...@@ -105,8 +94,6 @@ endif()
if (WIN32) if (WIN32)
set(WITH_DISTRIBUTE OFF CACHE STRING set(WITH_DISTRIBUTE OFF CACHE STRING
"Disable DISTRIBUTE when compiling for Windows" FORCE) "Disable DISTRIBUTE when compiling for Windows" FORCE)
set(WITH_FLUID_ONLY ON CACHE STRING
"Enable FLUID_ONLY when compiling for Windows" FORCE)
endif() endif()
set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
...@@ -148,7 +135,6 @@ include(external/openblas) # download, build, install openblas ...@@ -148,7 +135,6 @@ include(external/openblas) # download, build, install openblas
include(external/mkldnn) # download, build, install mkldnn include(external/mkldnn) # download, build, install mkldnn
include(external/ngraph) # download, build, install nGraph include(external/ngraph) # download, build, install nGraph
include(external/boost) # download boost include(external/boost) # download boost
include(external/any) # download libn::any
include(external/eigen) # download eigen3 include(external/eigen) # download eigen3
include(external/pybind11) # download pybind11 include(external/pybind11) # download pybind11
include(external/cares) include(external/cares)
...@@ -225,7 +211,6 @@ include(generic) # simplify cmake module ...@@ -225,7 +211,6 @@ include(generic) # simplify cmake module
include(package) # set paddle packages include(package) # set paddle packages
include(ccache) # set ccache for compilation include(ccache) # set ccache for compilation
include(util) # set unittest and link libs include(util) # set unittest and link libs
include(rdma) # set rdma libraries
include(version) # set PADDLE_VERSION include(version) # set PADDLE_VERSION
include(coveralls) # set code coverage include(coveralls) # set code coverage
include(inference_lib) # add paddle fluid inference libraries include(inference_lib) # add paddle fluid inference libraries
...@@ -233,38 +218,11 @@ include(inference_lib) # add paddle fluid inference libraries ...@@ -233,38 +218,11 @@ include(inference_lib) # add paddle fluid inference libraries
include_directories("${PADDLE_SOURCE_DIR}") include_directories("${PADDLE_SOURCE_DIR}")
set(EXTERNAL_LIBS
gflags
glog
${CBLAS_LIBRARIES}
protobuf
zlib
${PYTHON_LIBRARIES}
)
if(WITH_PSLIB)
list(APPEND EXTERNAL_LIBS pslib)
list(APPEND EXTERNAL_LIBS pslib_brpc)
list(APPEND EXTERNAL_LIBS libmct)
endif(WITH_PSLIB)
if(WITH_AMD_GPU) if(WITH_AMD_GPU)
find_package(HIP) find_package(HIP)
include(hip) include(hip)
endif(WITH_AMD_GPU) endif(WITH_AMD_GPU)
if(WITH_MKLML)
list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB})
endif()
if(WITH_LIBXSMM)
list(APPEND EXTERNAL_LIBS ${LIBXSMM_LIBS})
endif()
if(WITH_MKLDNN)
list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB})
endif()
set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build") set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
......
...@@ -3,8 +3,8 @@ ...@@ -3,8 +3,8 @@
English | [简体中文](./README_cn.md) English | [简体中文](./README_cn.md)
[![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle) [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html) [![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.3/beginners_guide/index_en.html)
[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html) [![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.3/beginners_guide/index.html)
[![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases) [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE) [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
...@@ -18,7 +18,7 @@ learning to many products at Baidu. ...@@ -18,7 +18,7 @@ learning to many products at Baidu.
Our vision is to enable deep learning for everyone via PaddlePaddle. Our vision is to enable deep learning for everyone via PaddlePaddle.
Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle. Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle.
### Latest PaddlePaddle Release: [Fluid 1.2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2) ### Latest PaddlePaddle Release: [Fluid 1.3.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.3)
### Install Latest Stable Release: ### Install Latest Stable Release:
``` ```
# Linux CPU # Linux CPU
...@@ -26,9 +26,9 @@ pip install paddlepaddle ...@@ -26,9 +26,9 @@ pip install paddlepaddle
# Linux GPU cuda9cudnn7 # Linux GPU cuda9cudnn7
pip install paddlepaddle-gpu pip install paddlepaddle-gpu
# Linux GPU cuda8cudnn7 # Linux GPU cuda8cudnn7
pip install paddlepaddle-gpu==1.2.0.post87 pip install paddlepaddle-gpu==1.3.0.post87
# Linux GPU cuda8cudnn5 # Linux GPU cuda8cudnn5
pip install paddlepaddle-gpu==1.2.0.post85 pip install paddlepaddle-gpu==1.3.0.post85
# For installation on other platform, refer to http://paddlepaddle.org/ # For installation on other platform, refer to http://paddlepaddle.org/
``` ```
...@@ -75,26 +75,26 @@ pip install paddlepaddle-gpu==1.2.0.post85 ...@@ -75,26 +75,26 @@ pip install paddlepaddle-gpu==1.2.0.post85
## Installation ## Installation
It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/install/index_cn.html) on our website. It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/en/1.3/beginners_guide/index_en.html) on our website.
## Documentation ## Documentation
We provide [English](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html) and We provide [English](http://paddlepaddle.org/documentation/docs/en/1.3/beginners_guide/index_en.html) and
[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html) documentation. [Chinese](http://paddlepaddle.org/documentation/docs/zh/1.3/beginners_guide/index.html) documentation.
- [Deep Learning 101](https://github.com/PaddlePaddle/book) - [Deep Learning 101](https://github.com/PaddlePaddle/book)
You might want to start from this online interactive book that can run in a Jupyter Notebook. You might want to start from this online interactive book that can run in a Jupyter Notebook.
- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html) - [Distributed Training](http://paddlepaddle.org/documentation/docs/en/1.3/user_guides/howto/training/multi_node_en.html)
You can run distributed training jobs on MPI clusters. You can run distributed training jobs on MPI clusters.
- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.2/api_cn/index_cn.html) - [Python API](http://paddlepaddle.org/documentation/docs/en/1.3/api/index_en.html)
Our new API enables much shorter programs. Our new API enables much shorter programs.
- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/1.2/advanced_usage/development/contribute_to_paddle/index_cn.html) - [How to Contribute](http://paddlepaddle.org/documentation/docs/en/1.3/advanced_usage/development/contribute_to_paddle/index_en.html)
We appreciate your contributions! We appreciate your contributions!
......
...@@ -3,8 +3,8 @@ ...@@ -3,8 +3,8 @@
[English](./README.md) | 简体中文 [English](./README.md) | 简体中文
[![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle) [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html) [![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.3/beginners_guide/index_en.html)
[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html) [![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.3/beginners_guide/index.html)
[![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases) [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE) [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
...@@ -16,7 +16,7 @@ PaddlePaddle (PArallel Distributed Deep LEarning) 是一个简单易用、高效 ...@@ -16,7 +16,7 @@ PaddlePaddle (PArallel Distributed Deep LEarning) 是一个简单易用、高效
跟进PaddlePaddle最新特性请参考我们的[版本说明](https://github.com/PaddlePaddle/Paddle/releases) 跟进PaddlePaddle最新特性请参考我们的[版本说明](https://github.com/PaddlePaddle/Paddle/releases)
### PaddlePaddle最新版本: [Fluid 1.2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2) ### PaddlePaddle最新版本: [Fluid 1.3.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.3)
### 安装最新稳定版本: ### 安装最新稳定版本:
``` ```
# Linux CPU # Linux CPU
...@@ -24,9 +24,9 @@ pip install paddlepaddle ...@@ -24,9 +24,9 @@ pip install paddlepaddle
# Linux GPU cuda9cudnn7 # Linux GPU cuda9cudnn7
pip install paddlepaddle-gpu pip install paddlepaddle-gpu
# Linux GPU cuda8cudnn7 # Linux GPU cuda8cudnn7
pip install paddlepaddle-gpu==1.2.0.post87 pip install paddlepaddle-gpu==1.3.0.post87
# Linux GPU cuda8cudnn5 # Linux GPU cuda8cudnn5
pip install paddlepaddle-gpu==1.2.0.post85 pip install paddlepaddle-gpu==1.3.0.post85
# 其他平台上的安装指引请参考 http://paddlepaddle.org/ # 其他平台上的安装指引请参考 http://paddlepaddle.org/
``` ```
...@@ -57,26 +57,26 @@ pip install paddlepaddle-gpu==1.2.0.post85 ...@@ -57,26 +57,26 @@ pip install paddlepaddle-gpu==1.2.0.post85
## 安装 ## 安装
推荐阅读官网上的[安装说明](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/install/index_cn.html) 推荐阅读官网上的[安装说明](http://paddlepaddle.org/documentation/docs/zh/1.3/beginners_guide/install/index_cn.html)
## 文档 ## 文档
我们提供[英文](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html) 我们提供[英文](http://paddlepaddle.org/documentation/docs/en/1.3/beginners_guide/index_en.html)
[中文](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html) 文档 [中文](http://paddlepaddle.org/documentation/docs/zh/1.3/beginners_guide/index.html) 文档
- [深度学习101](https://github.com/PaddlePaddle/book) - [深度学习101](https://github.com/PaddlePaddle/book)
或许您想从这个在线交互式书籍开始,可以在Jupyter Notebook中运行 或许您想从这个在线交互式书籍开始,可以在Jupyter Notebook中运行
- [分布式训练](http://paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html) - [分布式训练](http://paddlepaddle.org/documentation/docs/zh/1.3/user_guides/howto/training/multi_node.html)
可以在MPI集群上运行分布式训练任务 可以在MPI集群上运行分布式训练任务
- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.2/api_cn/index_cn.html) - [Python API](http://paddlepaddle.org/documentation/docs/zh/1.3/api_cn/index_cn.html)
新的API支持代码更少更简洁的程序 新的API支持代码更少更简洁的程序
- [贡献方式](http://paddlepaddle.org/documentation/docs/zh/1.2/advanced_usage/development/contribute_to_paddle/index_cn.html) - [贡献方式](http://paddlepaddle.org/documentation/docs/zh/1.3/advanced_usage/development/contribute_to_paddle/index_cn.html)
欢迎您的贡献! 欢迎您的贡献!
......
# Benchmark
Machine:
- Server: Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz, 2 Sockets, 20 Cores per socket
- Laptop: TBD
System: CentOS release 6.3 (Final), Docker 1.12.1.
PaddlePaddle:
- paddlepaddle/paddle:0.11.0 (for MKLML and MKL-DNN)
- MKL-DNN tag v0.11
- MKLML 2018.0.1.20171007
- paddlepaddle/paddle:0.11.0-openblas (for OpenBLAS)
- OpenBLAS v0.2.20
On each machine, we will test and compare the performance of training on single node using MKL-DNN / MKLML / OpenBLAS respectively.
## Benchmark Model
### Server
#### Training
Test on batch size 64, 128, 256 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
Pay attetion that the speed below includes forward, backward and parameter update time. So we can not directly compare the data with the benchmark of caffe `time` [command](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/caffe/image/run.sh#L9), which only contain forward and backward. The updating time of parameter would become very heavy when the weight size are large, especially on alexnet.
Input image size - 3 * 224 * 224, Time: images/second
- VGG-19
| BatchSize | 64 | 128 | 256 |
|--------------|-------| -----| --------|
| OpenBLAS | 7.80 | 9.00 | 10.80 |
| MKLML | 12.12 | 13.70 | 16.18 |
| MKL-DNN | 28.46 | 29.83 | 30.44 |
<img src="figs/vgg-cpu-train.png" width="500">
- ResNet-50
| BatchSize | 64 | 128 | 256 |
|--------------|-------| ------| -------|
| OpenBLAS | 25.22 | 25.68 | 27.12 |
| MKLML | 32.52 | 31.89 | 33.12 |
| MKL-DNN | 81.69 | 82.35 | 84.08 |
<img src="figs/resnet-cpu-train.png" width="500">
- GoogLeNet
| BatchSize | 64 | 128 | 256 |
|--------------|-------| ------| -------|
| OpenBLAS | 89.52 | 96.97 | 108.25 |
| MKLML | 128.46| 137.89| 158.63 |
| MKL-DNN     | 250.46| 264.83| 269.50 |
<img src="figs/googlenet-cpu-train.png" width="500">
- AlexNet
| BatchSize | 64 | 128 | 256 |
|--------------|--------| ------ | -------|
| OpenBLAS | 45.62 | 72.79 | 107.22 |
| MKLML | 66.37 | 105.60 | 144.04 |
| MKL-DNN | 399.00 | 498.94 | 626.53 |
<img src="figs/alexnet-cpu-train.png" width="500">
#### Inference
Test on batch size 1, 2, 4, 8, 16 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
- VGG-19
| BatchSize | 1 | 2 | 4 | 8 | 16 |
|-----------|-------|-------|-------|-------|-------|
| OpenBLAS | 1.10 | 1.96 | 3.62 | 3.63 | 2.25 |
| MKLML | 5.58 | 9.80 | 15.15 | 21.21 | 28.67 |
| MKL-DNN | 75.07 | 88.64 | 82.58 | 92.29 | 96.75 |
<img src="figs/vgg-cpu-infer.png" width="500">
- ResNet-50
| BatchSize | 1 | 2 | 4 | 8 | 16 |
|-----------|-------|--------|--------|--------|--------|
| OpenBLAS | 3.31 | 6.72 | 11.59 | 13.17 | 9.27 |
| MKLML | 6.33 | 12.02 | 22.88 | 40.53 | 63.09 |
| MKL-DNN | 107.83| 148.84 | 177.78 | 189.35 | 217.69 |
<img src="figs/resnet-cpu-infer.png" width="500">
- GoogLeNet
| BatchSize | 1 | 2 | 4 | 8 | 16 |
|-----------|--------|--------|--------|--------|--------|
| OpenBLAS | 12.06 | 23.56 | 34.48 | 36.45 | 23.12 |
| MKLML | 22.74 | 41.56 | 81.22 | 133.47 | 210.53 |
| MKL-DNN | 175.10 | 272.92 | 450.70 | 512.00 | 600.94 |
<img src="figs/googlenet-cpu-infer.png" width="500">
- AlexNet
| BatchSize | 1 | 2 | 4 | 8 | 16 |
|-----------|--------|--------|--------|--------|--------|
| OpenBLAS | 3.53 | 6.23 | 15.04 | 26.06 | 31.62 |
| MKLML | 21.32 | 36.55 | 73.06 | 131.15 | 192.77 |
| MKL-DNN | 442.91 | 656.41 | 719.10 | 847.68 | 850.51 |
<img src="figs/alexnet-cpu-infer.png" width="500">
### Laptop
TBD
# Benchmark
Machine:
- CPU: 12-core Intel(R) Xeon(R) CPU E5-2620 v2 @2.10GHz
- GPU: Tesla K40m
- cuDNN: v5.1
- system: Docker 1.12.1, all platforms are tested in docker environment.
Platforms:
- PaddlePaddle: paddledev/paddle:gpu-devel-v0.9.0a0
- Tensorflow: gcr.io/tensorflow/tensorflow:0.11.0rc0-gpu
- Caffe: kaixhin/cuda-caffe
Several convolutional neural networks and recurrent neural networks are used to test.
## Image
### Benchmark Model
AlexNet, GoogleNet and a small network used in Caffe.
- [AlexNet](https://github.com/BVLC/caffe/tree/master/models/bvlc_alexnet): but the group size is one.
- [GoogleNet](https://github.com/BVLC/caffe/tree/master/models/bvlc_googlenet): but remove loss1 and loss2 when testing benchmark.
- [SmallNet](https://github.com/BVLC/caffe/blob/master/examples/cifar10/cifar10\_quick\_train\_test.prototxt)
### Single-GPU
- AlexNet: input - 3 * 227 * 227, Time: ms/batch
| BatchSize | 64 | 128 | 256 | 512 |
|--------------|-----| -----| ------| -----|
| PaddlePaddle | 195 | 334 | 602 | 1629 |
| TensorFlow | 223 | 364 | 645 | 1235 |
| Caffe | 324 | 627 | 1232 | 2513 |
**Notation**
All platforms use cuDNN-v5.1. We see that caffe is slower in this experiment, because its workspace limit size of cuDNN-conv interface is 8 * 1024 * 1024, which is smaller in PaddlePaddle and TensorFlow. Note that Caffe will be faster if increasing the workspace limit size.
- GoogletNet: input - 3 * 224 * 224, Time: ms/batch
| BatchSize | 64 | 128 | 256 |
|--------------|-------| -------| --------|
| PaddlePaddle | 613 | 1149 | 2348 |
| TensorFlow | 644 | 1176 | 2219 |
| Caffe | 694 | 1364 | out of memory |
- SmallNet: input - 3 * 32 * 32, Time ms/batch
| BatchSize | 64 | 128 | 256 | 512 |
|--------------|--------| -------- | --------|---------|
| PaddlePaddle | 10.463 | 18.184 | 33.113 | 63.039 |
| TensorFlow | 9 | 15 | 28 | 59 |
| Caffe | 9.373 | 16.6606 | 31.4797 | 59.719 |
**Notation**
All the single-GPU experiments in caffe use `caffe time` to calculate elapsed time, which does not include parameter updating time. However, both PaddlePaddle and TensorFlow experiments contain the parameter updating time. As compared with the total time, this part is relatively little on single machine, we can ignore it.
In Tensorflow, they implement algorithm searching method instead of using the algorithm searching interface in cuDNN.
### Multi-GPU: 4 GPUs
- AlexNet, ms / batch
| total-BatchSize | 128 * 4 | 256 * 4 |
|------------------|----------| -----------|
| PaddlePaddle | 347 | 622 |
| TensorFlow | 377 | 675 |
| Caffe | 1229 | 2435 |
For example, if `total-BatchSize = 128 * 4`, the speedup ratio is calculated by
```
time_at_1gpu_batch_128 * 4 / time_at_4gpu_total_batch_512
= (334 * 4)/347
= 3.85
```
<img src="figs/alexnet-4gpu.png" width="420">
- GoogleNet, ms / batch
| total-BatchSize | 128 * 4 | 256 * 4 |
|-------------------|--------------| ----------- |
| PaddlePaddle | 1178 | 2367 |
| TensorFlow | 1210 | 2292 |
| Caffe | 2007 | out of memory |
<img src="figs/googlenet-4gpu.png" width="420">
## RNN
We use lstm network for text classfication to test benchmark.
### Dataset
- [IMDB](http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl)
- Sequence length is 100. In fact, PaddlePaddle supports training with variable-length sequence, but TensorFlow needs to pad. Thus, we also pad sequence length to 100 in PaddlePaddle in order to compare.
- Dictionary size=30000
- Peephole connection is used in `lstmemory` by default in PaddlePaddle. It is also configured in TensorFlow.
### Single-GPU
#### LSTM in Text Classification
Testing `2 lstm layer + fc` network with different hidden size and batch size.
- Batch size = 64, ms / batch
| hidden_size | 256 | 512 | 1280 |
|--------------|-------| -------| --------|
| PaddlePaddle | 83 | 184 | 641 |
| TensorFlow | 175 | 280 | 818 |
- Batch size = 128, ms / batch
| hidden_size | 256 | 512 | 1280 |
|--------------|------- | -------| --------|
| PaddlePaddle | 110 | 261 | 1007 |
| TensorFlow | 181 | 361 | 1237 |
- Batch size = 256, ms / batch
| hidden_size | 256 | 512 | 1280 |
|--------------|-------| -------| --------|
| PaddlePaddle | 170 | 414 | 1655 |
| TensorFlow | 238 | 536 | 1905 |
<img src="figs/rnn_lstm_cls.png" width="600">
#### Seq2Seq
The benchmark of sequence-to-sequence network will be added later.
### Multi GPU: 4 GPUs
#### LSTM in Text Classification
- hidden_size = 256, ms / batch
| batch_size | 256 | 512 |
|--------------| -------| --------|
| PaddlePaddle | 90 | 118 |
| TensorFlow | 226 | 118 |
- hidden_size = 512, ms / batch
| batch_size | 256 | 512 |
|--------------| -------| --------|
| PaddlePaddle | 189 | 268 |
| TensorFlow | 297 | 383 |
<img src="figs/rnn_lstm_4gpus.png" width="420">
#### Seq2Seq
The benchmark of sequence-to-sequence network will be added later.
...@@ -15,9 +15,6 @@ RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s ...@@ -15,9 +15,6 @@ RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s
RUN pip install -U pip RUN pip install -U pip
RUN pip install -U kubernetes paddlepaddle RUN pip install -U kubernetes paddlepaddle
RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()\npaddle.dataset.flowers.fetch()" | python'
RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.mnist.train()\npaddle.dataset.mnist.test()\npaddle.dataset.imdb.fetch()" | python'
RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.imikolov.fetch()" | python'
RUN pip uninstall -y paddlepaddle && mkdir /workspace RUN pip uninstall -y paddlepaddle && mkdir /workspace
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
height = 227
width = 227
num_class = 1000
batch_size = get_config_arg('batch_size', int, 128)
gp = get_config_arg('layer_num', int, 1)
is_infer = get_config_arg("is_infer", bool, False)
num_samples = get_config_arg('num_samples', int, 2560)
args = {
'height': height,
'width': width,
'color': True,
'num_class': num_class,
'is_infer': is_infer,
'num_samples': num_samples
}
define_py_data_sources2(
"train.list" if not is_infer else None,
"test.list" if is_infer else None,
module="provider",
obj="process",
args=args)
settings(
batch_size=batch_size,
learning_rate=0.01 / batch_size,
learning_method=MomentumOptimizer(0.9),
regularization=L2Regularization(0.0005 * batch_size))
# conv1
net = data_layer('data', size=height * width * 3)
net = img_conv_layer(
input=net,
filter_size=11,
num_channels=3,
num_filters=96,
stride=4,
padding=1)
net = img_cmrnorm_layer(input=net, size=5, scale=0.0001, power=0.75)
net = img_pool_layer(input=net, pool_size=3, stride=2)
# conv2
net = img_conv_layer(
input=net, filter_size=5, num_filters=256, stride=1, padding=2, groups=gp)
net = img_cmrnorm_layer(input=net, size=5, scale=0.0001, power=0.75)
net = img_pool_layer(input=net, pool_size=3, stride=2)
# conv3
net = img_conv_layer(
input=net, filter_size=3, num_filters=384, stride=1, padding=1)
# conv4
net = img_conv_layer(
input=net, filter_size=3, num_filters=384, stride=1, padding=1, groups=gp)
# conv5
net = img_conv_layer(
input=net, filter_size=3, num_filters=256, stride=1, padding=1, groups=gp)
net = img_pool_layer(input=net, pool_size=3, stride=2)
net = fc_layer(
input=net,
size=4096,
act=ReluActivation(),
layer_attr=ExtraAttr(drop_rate=0.5))
net = fc_layer(
input=net,
size=4096,
act=ReluActivation(),
layer_attr=ExtraAttr(drop_rate=0.5))
net = fc_layer(input=net, size=1000, act=SoftmaxActivation())
if is_infer:
outputs(net)
else:
lab = data_layer('label', num_class)
loss = cross_entropy(input=net, label=lab)
outputs(loss)
#!/usr/bin/env python
from paddle.trainer_config_helpers import *
height = 224
width = 224
num_class = 1000
batch_size = get_config_arg('batch_size', int, 128)
use_gpu = get_config_arg('use_gpu', bool, True)
is_infer = get_config_arg("is_infer", bool, False)
num_samples = get_config_arg('num_samples', int, 2560)
args = {
'height': height,
'width': width,
'color': True,
'num_class': num_class,
'is_infer': is_infer,
'num_samples': num_samples
}
define_py_data_sources2(
"train.list" if not is_infer else None,
"test.list" if is_infer else None,
module="provider",
obj="process",
args=args)
settings(
batch_size=batch_size,
learning_rate=0.01 / batch_size,
learning_method=MomentumOptimizer(0.9),
regularization=L2Regularization(0.0005 * batch_size))
conv_projection = conv_projection if use_gpu else img_conv_layer
def inception2(name, input, channels, \
filter1,
filter3R, filter3,
filter5R, filter5,
proj):
conv1 = name + '_1'
conv3r = name + '_3r'
conv3 = name + '_3'
conv5r = name + '_5r'
conv5 = name + '_5'
maxpool = name + '_max'
convproj = name + '_proj'
cov1 = img_conv_layer(
name=conv1,
input=input,
filter_size=1,
num_channels=channels,
num_filters=filter1,
stride=1,
padding=0)
cov3r = img_conv_layer(
name=conv3r,
input=input,
filter_size=1,
num_channels=channels,
num_filters=filter3R,
stride=1,
padding=0)
cov3 = img_conv_layer(
name=conv3,
input=cov3r,
filter_size=3,
num_filters=filter3,
stride=1,
padding=1)
cov5r = img_conv_layer(
name=conv5r,
input=input,
filter_size=1,
num_channels=channels,
num_filters=filter5R,
stride=1,
padding=0)
cov5 = img_conv_layer(
name=conv5,
input=cov5r,
filter_size=5,
num_filters=filter5,
stride=1,
padding=2)
pool1 = img_pool_layer(
name=maxpool,
input=input,
pool_size=3,
num_channels=channels,
stride=1,
padding=1)
covprj = img_conv_layer(
name=convproj,
input=pool1,
filter_size=1,
num_filters=proj,
stride=1,
padding=0)
cat = concat_layer(name=name, input=[cov1, cov3, cov5, covprj])
return cat
def inception(name, input, channels, \
filter1,
filter3R, filter3,
filter5R, filter5,
proj):
cov1 = conv_projection(
input=input,
filter_size=1,
num_channels=channels,
num_filters=filter1,
stride=1,
padding=0)
cov3r = img_conv_layer(
name=name + '_3r',
input=input,
filter_size=1,
num_channels=channels,
num_filters=filter3R,
stride=1,
padding=0)
cov3 = conv_projection(
input=cov3r, filter_size=3, num_filters=filter3, stride=1, padding=1)
cov5r = img_conv_layer(
name=name + '_5r',
input=input,
filter_size=1,
num_channels=channels,
num_filters=filter5R,
stride=1,
padding=0)
cov5 = conv_projection(
input=cov5r, filter_size=5, num_filters=filter5, stride=1, padding=2)
pool1 = img_pool_layer(
name=name + '_max',
input=input,
pool_size=3,
num_channels=channels,
stride=1,
padding=1)
covprj = conv_projection(
input=pool1, filter_size=1, num_filters=proj, stride=1, padding=0)
cat = concat_layer(
name=name,
input=[cov1, cov3, cov5, covprj],
bias_attr=True if use_gpu else False,
act=ReluActivation())
return cat
data = data_layer(name="input", size=3 * height * width)
# stage 1
conv1 = img_conv_layer(
name="conv1",
input=data,
filter_size=7,
num_channels=3,
num_filters=64,
stride=2,
padding=3)
pool1 = img_pool_layer(
name="pool1", input=conv1, pool_size=3, num_channels=64, stride=2)
# stage 2
conv2_1 = img_conv_layer(
name="conv2_1",
input=pool1,
filter_size=1,
num_filters=64,
stride=1,
padding=0)
conv2_2 = img_conv_layer(
name="conv2_2",
input=conv2_1,
filter_size=3,
num_filters=192,
stride=1,
padding=1)
pool2 = img_pool_layer(
name="pool2", input=conv2_2, pool_size=3, num_channels=192, stride=2)
# stage 3
ince3a = inception("ince3a", pool2, 192, 64, 96, 128, 16, 32, 32)
ince3b = inception("ince3b", ince3a, 256, 128, 128, 192, 32, 96, 64)
pool3 = img_pool_layer(
name="pool3", input=ince3b, num_channels=480, pool_size=3, stride=2)
# stage 4
ince4a = inception("ince4a", pool3, 480, 192, 96, 208, 16, 48, 64)
ince4b = inception("ince4b", ince4a, 512, 160, 112, 224, 24, 64, 64)
ince4c = inception("ince4c", ince4b, 512, 128, 128, 256, 24, 64, 64)
ince4d = inception("ince4d", ince4c, 512, 112, 144, 288, 32, 64, 64)
ince4e = inception("ince4e", ince4d, 528, 256, 160, 320, 32, 128, 128)
pool4 = img_pool_layer(
name="pool4", input=ince4e, num_channels=832, pool_size=3, stride=2)
# stage 5
ince5a = inception("ince5a", pool4, 832, 256, 160, 320, 32, 128, 128)
ince5b = inception("ince5b", ince5a, 832, 384, 192, 384, 48, 128, 128)
pool5 = img_pool_layer(
name="pool5",
input=ince5b,
num_channels=1024,
pool_size=7,
stride=7,
pool_type=AvgPooling())
# We remove loss1 and loss2 for all system when testing benchmark
# output 1
# pool_o1 = img_pool_layer(name="pool_o1", input=ince4a, num_channels=512, pool_size=5, stride=3, pool_type=AvgPooling())
# conv_o1 = img_conv_layer(name="conv_o1", input=pool_o1, filter_size=1, num_filters=128, stride=1, padding=0)
# fc_o1 = fc_layer(name="fc_o1", input=conv_o1, size=1024, layer_attr=ExtraAttr(drop_rate=0.7), act=ReluActivation())
# out1 = fc_layer(name="output1", input=fc_o1, size=1000, act=SoftmaxActivation())
# loss1 = cross_entropy(name='loss1', input=out1, label=lab, coeff=0.3)
# output 2
#pool_o2 = img_pool_layer(name="pool_o2", input=ince4d, num_channels=528, pool_size=5, stride=3, pool_type=AvgPooling())
#conv_o2 = img_conv_layer(name="conv_o2", input=pool_o2, filter_size=1, num_filters=128, stride=1, padding=0)
#fc_o2 = fc_layer(name="fc_o2", input=conv_o2, size=1024, layer_attr=ExtraAttr(drop_rate=0.7), act=ReluActivation())
#out2 = fc_layer(name="output2", input=fc_o2, size=1000, act=SoftmaxActivation())
#loss2 = cross_entropy(name='loss2', input=out2, label=lab, coeff=0.3)
# output 3
dropout = dropout_layer(name="dropout", input=pool5, dropout_rate=0.4)
out3 = fc_layer(
name="output3", input=dropout, size=1000, act=SoftmaxActivation())
if is_infer:
outputs(out3)
else:
lab = data_layer(name="label", size=num_class)
loss3 = cross_entropy(name='loss3', input=out3, label=lab)
outputs(loss3)
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import argparse
import matplotlib.pyplot as plt
def parse_args():
parser = argparse.ArgumentParser('Parse Log')
parser.add_argument(
'--file_path', '-f', type=str, help='the path of the log file')
parser.add_argument(
'--sample_rate',
'-s',
type=float,
default=1.0,
help='the rate to take samples from log')
parser.add_argument(
'--log_period', '-p', type=int, default=1, help='the period of log')
args = parser.parse_args()
return args
def parse_file(file_name):
loss = []
error = []
with open(file_name) as f:
for i, line in enumerate(f):
line = line.strip()
if not line.startswith('pass'):
continue
line_split = line.split(' ')
if len(line_split) != 5:
continue
loss_str = line_split[2][:-1]
cur_loss = float(loss_str.split('=')[-1])
loss.append(cur_loss)
err_str = line_split[3][:-1]
cur_err = float(err_str.split('=')[-1])
error.append(cur_err)
accuracy = [1.0 - err for err in error]
return loss, accuracy
def sample(metric, sample_rate):
interval = int(1.0 / sample_rate)
if interval > len(metric):
return metric[:1]
num = len(metric) / interval
idx = [interval * i for i in range(num)]
metric_sample = [metric[id] for id in idx]
return metric_sample
def plot_metric(metric,
batch_id,
graph_title,
line_style='b-',
line_label='y',
line_num=1):
plt.figure()
plt.title(graph_title)
if line_num == 1:
plt.plot(batch_id, metric, line_style, label=line_label)
else:
for i in range(line_num):
plt.plot(batch_id, metric[i], line_style[i], label=line_label[i])
plt.xlabel('batch')
plt.ylabel(graph_title)
plt.legend()
plt.savefig(graph_title + '.jpg')
plt.close()
def main():
args = parse_args()
assert args.sample_rate > 0. and args.sample_rate <= 1.0, "The sample rate should in the range (0, 1]."
loss, accuracy = parse_file(args.file_path)
batch = [args.log_period * i for i in range(len(loss))]
batch_sample = sample(batch, args.sample_rate)
loss_sample = sample(loss, args.sample_rate)
accuracy_sample = sample(accuracy, args.sample_rate)
plot_metric(loss_sample, batch_sample, 'loss', line_label='loss')
plot_metric(
accuracy_sample,
batch_sample,
'accuracy',
line_style='g-',
line_label='accuracy')
if __name__ == '__main__':
main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io, os
import random
import numpy as np
from paddle.trainer.PyDataProvider2 import *
def initHook(settings, height, width, color, num_class, **kwargs):
settings.height = height
settings.width = width
settings.color = color
settings.num_class = num_class
if settings.color:
settings.data_size = settings.height * settings.width * 3
else:
settings.data_size = settings.height * settings.width
settings.is_infer = kwargs.get('is_infer', False)
settings.num_samples = kwargs.get('num_samples', 2560)
if settings.is_infer:
settings.slots = [dense_vector(settings.data_size)]
else:
settings.slots = [dense_vector(settings.data_size), integer_value(1)]
@provider(
init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM)
def process(settings, file_list):
for i in xrange(settings.num_samples):
img = np.random.rand(1, settings.data_size).reshape(-1, 1).flatten()
if settings.is_infer:
yield img.astype('float32')
else:
lab = random.randint(0, settings.num_class - 1)
yield img.astype('float32'), int(lab)
#!/usr/bin/env python
from paddle.trainer_config_helpers import *
height = 224
width = 224
num_class = 1000
batch_size = get_config_arg('batch_size', int, 64)
layer_num = get_config_arg("layer_num", int, 50)
is_infer = get_config_arg("is_infer", bool, False)
num_samples = get_config_arg('num_samples', int, 2560)
args = {
'height': height,
'width': width,
'color': True,
'num_class': num_class,
'is_infer': is_infer,
'num_samples': num_samples
}
define_py_data_sources2(
"train.list" if not is_infer else None,
"test.list" if is_infer else None,
module="provider",
obj="process",
args=args)
settings(
batch_size=batch_size,
learning_rate=0.01 / batch_size,
learning_method=MomentumOptimizer(0.9),
regularization=L2Regularization(0.0005 * batch_size))
#######################Network Configuration #############
def conv_bn_layer(name,
input,
filter_size,
num_filters,
stride,
padding,
channels=None,
active_type=ReluActivation()):
"""
A wrapper for conv layer with batch normalization layers.
Note:
conv layer has no activation.
"""
tmp = img_conv_layer(
name=name + "_conv",
input=input,
filter_size=filter_size,
num_channels=channels,
num_filters=num_filters,
stride=stride,
padding=padding,
act=LinearActivation(),
bias_attr=False)
return batch_norm_layer(
name=name + "_bn",
input=tmp,
act=active_type,
use_global_stats=is_infer)
def bottleneck_block(name, input, num_filters1, num_filters2):
"""
A wrapper for bottlenect building block in ResNet.
Last conv_bn_layer has no activation.
Addto layer has activation of relu.
"""
last_name = conv_bn_layer(
name=name + '_branch2a',
input=input,
filter_size=1,
num_filters=num_filters1,
stride=1,
padding=0)
last_name = conv_bn_layer(
name=name + '_branch2b',
input=last_name,
filter_size=3,
num_filters=num_filters1,
stride=1,
padding=1)
last_name = conv_bn_layer(
name=name + '_branch2c',
input=last_name,
filter_size=1,
num_filters=num_filters2,
stride=1,
padding=0,
active_type=LinearActivation())
return addto_layer(
name=name + "_addto", input=[input, last_name], act=ReluActivation())
def mid_projection(name, input, num_filters1, num_filters2, stride=2):
"""
A wrapper for middile projection in ResNet.
projection shortcuts are used for increasing dimensions,
and other shortcuts are identity
branch1: projection shortcuts are used for increasing
dimensions, has no activation.
branch2x: bottleneck building block, shortcuts are identity.
"""
# stride = 2
branch1 = conv_bn_layer(
name=name + '_branch1',
input=input,
filter_size=1,
num_filters=num_filters2,
stride=stride,
padding=0,
active_type=LinearActivation())
last_name = conv_bn_layer(
name=name + '_branch2a',
input=input,
filter_size=1,
num_filters=num_filters1,
stride=stride,
padding=0)
last_name = conv_bn_layer(
name=name + '_branch2b',
input=last_name,
filter_size=3,
num_filters=num_filters1,
stride=1,
padding=1)
last_name = conv_bn_layer(
name=name + '_branch2c',
input=last_name,
filter_size=1,
num_filters=num_filters2,
stride=1,
padding=0,
active_type=LinearActivation())
return addto_layer(
name=name + "_addto", input=[branch1, last_name], act=ReluActivation())
img = data_layer(name='image', size=height * width * 3)
def deep_res_net(res2_num=3, res3_num=4, res4_num=6, res5_num=3):
"""
A wrapper for 50,101,152 layers of ResNet.
res2_num: number of blocks stacked in conv2_x
res3_num: number of blocks stacked in conv3_x
res4_num: number of blocks stacked in conv4_x
res5_num: number of blocks stacked in conv5_x
"""
# For ImageNet
# conv1: 112x112
tmp = conv_bn_layer(
"conv1",
input=img,
filter_size=7,
channels=3,
num_filters=64,
stride=2,
padding=3)
tmp = img_pool_layer(name="pool1", input=tmp, pool_size=3, stride=2)
# conv2_x: 56x56
tmp = mid_projection(
name="res2_1", input=tmp, num_filters1=64, num_filters2=256, stride=1)
for i in xrange(2, res2_num + 1, 1):
tmp = bottleneck_block(
name="res2_" + str(i), input=tmp, num_filters1=64, num_filters2=256)
# conv3_x: 28x28
tmp = mid_projection(
name="res3_1", input=tmp, num_filters1=128, num_filters2=512)
for i in xrange(2, res3_num + 1, 1):
tmp = bottleneck_block(
name="res3_" + str(i),
input=tmp,
num_filters1=128,
num_filters2=512)
# conv4_x: 14x14
tmp = mid_projection(
name="res4_1", input=tmp, num_filters1=256, num_filters2=1024)
for i in xrange(2, res4_num + 1, 1):
tmp = bottleneck_block(
name="res4_" + str(i),
input=tmp,
num_filters1=256,
num_filters2=1024)
# conv5_x: 7x7
tmp = mid_projection(
name="res5_1", input=tmp, num_filters1=512, num_filters2=2048)
for i in xrange(2, res5_num + 1, 1):
tmp = bottleneck_block(
name="res5_" + str(i),
input=tmp,
num_filters1=512,
num_filters2=2048)
tmp = img_pool_layer(
name='avgpool',
input=tmp,
pool_size=7,
stride=1,
pool_type=AvgPooling())
return fc_layer(input=tmp, size=num_class, act=SoftmaxActivation())
if layer_num == 50:
resnet = deep_res_net(3, 4, 6, 3)
elif layer_num == 101:
resnet = deep_res_net(3, 4, 23, 3)
elif layer_num == 152:
resnet = deep_res_net(3, 8, 36, 3)
else:
print("Wrong layer number.")
if is_infer:
outputs(resnet)
else:
lbl = data_layer(name="label", size=num_class)
loss = cross_entropy(name='loss', input=resnet, label=lbl)
outputs(loss)
#!/bin/bash
set -e
function train() {
cfg=$1
thread=$2
bz=$3
args="batch_size=$3"
prefix=$4
paddle train --job=time \
--config=$cfg \
--use_gpu=True \
--trainer_count=$thread \
--log_period=10 \
--test_period=100 \
--config_args=$args \
> logs/$prefix-${thread}gpu-$bz.log 2>&1
}
if [ ! -d "train.list" ]; then
echo " " > train.list
fi
if [ ! -d "logs" ]; then
mkdir logs
fi
#========single-gpu=========#
# alexnet
train alexnet.py 1 64 alexnet
train alexnet.py 1 128 alexnet
train alexnet.py 1 256 alexnet
train alexnet.py 1 512 alexnet
# googlenet
train googlenet.py 1 64 googlenet
train googlenet.py 1 128 googlenet
train googlenet.py 1 256 googlenet
# smallnet
train smallnet_mnist_cifar.py 1 64 smallnet
train smallnet_mnist_cifar.py 1 128 smallnet
train smallnet_mnist_cifar.py 1 256 smallnet
train smallnet_mnist_cifar.py 1 512 smallnet
############################
#========multi-gpus=========#
train alexnet.py 4 512 alexnet
train alexnet.py 4 1024 alexnet
train googlenet.py 4 512 googlenet
train googlenet.py 4 1024 googlenet
#!/bin/bash
set -e
function clock_to_seconds() {
hours=`echo $1 | awk -F ':' '{print $1}'`
mins=`echo $1 | awk -F ':' '{print $2}'`
secs=`echo $1 | awk -F ':' '{print $3}'`
echo `awk 'BEGIN{printf "%.2f",('$secs' + '$mins' * 60 + '$hours' * 3600)}'`
}
function infer() {
unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
topology=$1
layer_num=$2
bs=$3
use_mkldnn=$4
if [ $4 == "True" ]; then
thread=1
log="logs/infer-${topology}-${layer_num}-mkldnn-${bs}.log"
elif [ $4 == "False" ]; then
thread=`nproc`
if [ $thread -gt $bs ]; then
thread=$bs
fi
log="logs/infer-${topology}-${layer_num}-${thread}mklml-${bs}.log"
else
echo "Wrong input $4, use True or False."
exit 0
fi
models_in="models/${topology}-${layer_num}/pass-00000/"
if [ ! -d $models_in ]; then
echo "Training model ${topology}_${layer_num}"
paddle train --job=train \
--config="${topology}.py" \
--use_mkldnn=True \
--use_gpu=False \
--trainer_count=1 \
--num_passes=1 \
--save_dir="models/${topology}-${layer_num}" \
--config_args="batch_size=128,layer_num=${layer_num},num_samples=256" \
> /dev/null 2>&1
echo "Done"
fi
log_period=$((256 / bs))
paddle train --job=test \
--config="${topology}.py" \
--use_mkldnn=$use_mkldnn \
--use_gpu=False \
--trainer_count=$thread \
--log_period=$log_period \
--config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True" \
--init_model_path=$models_in \
2>&1 | tee ${log}
# calculate the last 5 logs period time of 1280 samples,
# the time before are burning time.
start=`tail ${log} -n 7 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
end=`tail ${log} -n 2 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
start_sec=`clock_to_seconds $start`
end_sec=`clock_to_seconds $end`
fps=`awk 'BEGIN{printf "%.2f",(1280 / ('$end_sec' - '$start_sec'))}'`
echo "Last 1280 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec;" >> ${log}
echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
}
if [ ! -f "train.list" ]; then
echo " " > train.list
fi
if [ ! -f "test.list" ]; then
echo " " > test.list
fi
if [ ! -d "logs" ]; then
mkdir logs
fi
if [ ! -d "models" ]; then
mkdir -p models
fi
# inference benchmark
for use_mkldnn in True False; do
for batchsize in 1 2 4 8 16; do
infer vgg 19 $batchsize $use_mkldnn
infer resnet 50 $batchsize $use_mkldnn
infer googlenet v1 $batchsize $use_mkldnn
infer alexnet 2 $batchsize $use_mkldnn
done
done
#!/bin/bash
set -e
function train() {
unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
topology=$1
layer_num=$2
bs=$3
use_mkldnn=$4
if [ $4 == "True" ]; then
thread=1
log="logs/train-${topology}-${layer_num}-mkldnn-${bs}.log"
elif [ $4 == "False" ]; then
thread=`nproc`
# each trainer_count use only 1 core to avoid conflict
log="logs/train-${topology}-${layer_num}-${thread}mklml-${bs}.log"
else
echo "Wrong input $4, use True or False."
exit 0
fi
args="batch_size=${bs},layer_num=${layer_num}"
config="${topology}.py"
paddle train --job=time \
--config=$config \
--use_mkldnn=$use_mkldnn \
--use_gpu=False \
--trainer_count=$thread \
--log_period=10 \
--test_period=100 \
--config_args=$args \
2>&1 | tee ${log}
avg_time=`tail ${log} -n 1 | awk -F ' ' '{print $8}' | sed 's/avg=//'`
fps=`awk 'BEGIN{printf "%.2f",('$bs' / '$avg_time' * 1000)}'`
echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
}
if [ ! -f "train.list" ]; then
echo " " > train.list
fi
if [ ! -d "logs" ]; then
mkdir logs
fi
# training benchmark
for use_mkldnn in True False; do
for batchsize in 64 128 256; do
train vgg 19 $batchsize $use_mkldnn
train resnet 50 $batchsize $use_mkldnn
train googlenet v1 $batchsize $use_mkldnn
train alexnet 2 $batchsize $use_mkldnn
done
done
#!/bin/bash
set -e
function clock_to_seconds() {
hours=`echo $1 | awk -F ':' '{print $1}'`
mins=`echo $1 | awk -F ':' '{print $2}'`
secs=`echo $1 | awk -F ':' '{print $3}'`
echo `awk 'BEGIN{printf "%.2f",('$secs' + '$mins' * 60 + '$hours' * 3600)}'`
}
function infer() {
export OPENBLAS_MAIN_FREE=1
topology=$1
layer_num=$2
bs=$3
trainers=`nproc`
if [ $trainers -gt $bs ]; then
trainers=$bs
fi
log="logs/infer-${topology}-${layer_num}-${trainers}openblas-${bs}.log"
threads=$((`nproc` / trainers))
if [ $threads -eq 0 ]; then
threads=1
fi
export OPENBLAS_NUM_THREADS=$threads
models_in="models/${topology}-${layer_num}/pass-00000/"
if [ ! -d $models_in ]; then
echo "./run_mkl_infer.sh to save the model first"
exit 0
fi
log_period=$((32 / bs))
paddle train --job=test \
--config="${topology}.py" \
--use_mkldnn=False \
--use_gpu=False \
--trainer_count=$trainers \
--log_period=$log_period \
--config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True,num_samples=256" \
--init_model_path=$models_in \
2>&1 | tee ${log}
# calculate the last 5 logs period time of 160(=32*5) samples,
# the time before are burning time.
start=`tail ${log} -n 7 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
end=`tail ${log} -n 2 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
start_sec=`clock_to_seconds $start`
end_sec=`clock_to_seconds $end`
fps=`awk 'BEGIN{printf "%.2f",(160 / ('$end_sec' - '$start_sec'))}'`
echo "Last 160 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec;" >> ${log}
echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
}
if [ ! -f "train.list" ]; then
echo " " > train.list
fi
if [ ! -f "test.list" ]; then
echo " " > test.list
fi
if [ ! -d "logs" ]; then
mkdir logs
fi
# inference benchmark
for batchsize in 1 2 4 8 16; do
infer vgg 19 $batchsize
infer resnet 50 $batchsize
infer googlenet v1 $batchsize
infer alexnet 2 $batchsize
done
#!/bin/bash
set -e
function train() {
export OPENBLAS_NUM_THREADS=1
topology=$1
layer_num=$2
bs=$3
thread=`nproc`
# each trainer_count use only 1 core to avoid conflict
log="logs/train-${topology}-${layer_num}-${thread}openblas-${bs}.log"
args="batch_size=${bs},layer_num=${layer_num}"
config="${topology}.py"
paddle train --job=time \
--config=$config \
--use_mkldnn=False \
--use_gpu=False \
--trainer_count=$thread \
--log_period=3 \
--test_period=30 \
--config_args=$args \
2>&1 | tee ${log}
avg_time=`tail ${log} -n 1 | awk -F ' ' '{print $8}' | sed 's/avg=//'`
fps=`awk 'BEGIN{printf "%.2f",('$bs' / '$avg_time' * 1000)}'`
echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
}
if [ ! -f "train.list" ]; then
echo " " > train.list
fi
if [ ! -d "logs" ]; then
mkdir logs
fi
# training benchmark
for batchsize in 64 128 256; do
train vgg 19 $batchsize
train resnet 50 $batchsize
train googlenet v1 $batchsize
train alexnet 2 $batchsize
done
#!/usr/bin/env python
from paddle.trainer_config_helpers import *
height = 32
width = 32
num_class = 10
batch_size = get_config_arg('batch_size', int, 128)
args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
define_py_data_sources2(
"train.list", None, module="provider", obj="process", args=args)
settings(
batch_size=batch_size,
learning_rate=0.01 / batch_size,
learning_method=MomentumOptimizer(0.9),
regularization=L2Regularization(0.0005 * batch_size))
# conv1
net = data_layer('data', size=height * width * 3)
net = img_conv_layer(
input=net,
filter_size=5,
num_channels=3,
num_filters=32,
stride=1,
padding=2)
net = img_pool_layer(input=net, pool_size=3, stride=2, padding=1)
# conv2
net = img_conv_layer(
input=net, filter_size=5, num_filters=32, stride=1, padding=2)
net = img_pool_layer(
input=net, pool_size=3, stride=2, padding=1, pool_type=AvgPooling())
# conv3
net = img_conv_layer(
input=net, filter_size=3, num_filters=64, stride=1, padding=1)
net = img_pool_layer(
input=net, pool_size=3, stride=2, padding=1, pool_type=AvgPooling())
net = fc_layer(input=net, size=64, act=ReluActivation())
net = fc_layer(input=net, size=10, act=SoftmaxActivation())
lab = data_layer('label', num_class)
loss = classification_cost(input=net, label=lab)
outputs(loss)
#!/usr/bin/env python
from paddle.trainer_config_helpers import *
height = 224
width = 224
num_class = 1000
batch_size = get_config_arg('batch_size', int, 64)
layer_num = get_config_arg('layer_num', int, 19)
is_infer = get_config_arg("is_infer", bool, False)
num_samples = get_config_arg('num_samples', int, 2560)
args = {
'height': height,
'width': width,
'color': True,
'num_class': num_class,
'is_infer': is_infer,
'num_samples': num_samples
}
define_py_data_sources2(
"train.list" if not is_infer else None,
"test.list" if is_infer else None,
module="provider",
obj="process",
args=args)
settings(
batch_size=batch_size,
learning_rate=0.001 / batch_size,
learning_method=MomentumOptimizer(0.9),
regularization=L2Regularization(0.0005 * batch_size))
img = data_layer(name='image', size=height * width * 3)
def vgg_network(vgg_num=3):
tmp = img_conv_group(
input=img,
num_channels=3,
conv_padding=1,
conv_num_filter=[64, 64],
conv_filter_size=3,
conv_act=ReluActivation(),
pool_size=2,
pool_stride=2,
pool_type=MaxPooling())
tmp = img_conv_group(
input=tmp,
conv_num_filter=[128, 128],
conv_padding=1,
conv_filter_size=3,
conv_act=ReluActivation(),
pool_stride=2,
pool_type=MaxPooling(),
pool_size=2)
channels = []
for i in range(vgg_num):
channels.append(256)
tmp = img_conv_group(
input=tmp,
conv_num_filter=channels,
conv_padding=1,
conv_filter_size=3,
conv_act=ReluActivation(),
pool_stride=2,
pool_type=MaxPooling(),
pool_size=2)
channels = []
for i in range(vgg_num):
channels.append(512)
tmp = img_conv_group(
input=tmp,
conv_num_filter=channels,
conv_padding=1,
conv_filter_size=3,
conv_act=ReluActivation(),
pool_stride=2,
pool_type=MaxPooling(),
pool_size=2)
tmp = img_conv_group(
input=tmp,
conv_num_filter=channels,
conv_padding=1,
conv_filter_size=3,
conv_act=ReluActivation(),
pool_stride=2,
pool_type=MaxPooling(),
pool_size=2)
tmp = fc_layer(
input=tmp,
size=4096,
act=ReluActivation(),
layer_attr=ExtraAttr(drop_rate=0.5))
tmp = fc_layer(
input=tmp,
size=4096,
act=ReluActivation(),
layer_attr=ExtraAttr(drop_rate=0.5))
return fc_layer(input=tmp, size=num_class, act=SoftmaxActivation())
if layer_num == 16:
vgg = vgg_network(3)
elif layer_num == 19:
vgg = vgg_network(4)
else:
print("Wrong layer number.")
if is_infer:
outputs(vgg)
else:
lab = data_layer('label', num_class)
loss = cross_entropy(input=vgg, label=lab)
outputs(loss)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io, os
import random
import numpy as np
import six.moves.cPickle as pickle
from paddle.trainer.PyDataProvider2 import *
def remove_unk(x, n_words):
return [[1 if w >= n_words else w for w in sen] for sen in x]
# ==============================================================
# tensorflow uses fixed length, but PaddlePaddle can process
# variable-length. Padding is used in benchmark in order to
# compare with other platform.
# ==============================================================
def pad_sequences(sequences,
maxlen=None,
dtype='int32',
padding='post',
truncating='post',
value=0.):
lengths = [len(s) for s in sequences]
nb_samples = len(sequences)
if maxlen is None:
maxlen = np.max(lengths)
x = (np.ones((nb_samples, maxlen)) * value).astype(dtype)
for idx, s in enumerate(sequences):
if len(s) == 0:
continue # empty list was found
if truncating == 'pre':
trunc = s[-maxlen:]
elif truncating == 'post':
trunc = s[:maxlen]
else:
raise ValueError("Truncating type '%s' not understood" % padding)
if padding == 'post':
x[idx, :len(trunc)] = trunc
elif padding == 'pre':
x[idx, -len(trunc):] = trunc
else:
raise ValueError("Padding type '%s' not understood" % padding)
return x
def initHook(settings, vocab_size, pad_seq, maxlen, **kwargs):
settings.vocab_size = vocab_size
settings.pad_seq = pad_seq
settings.maxlen = maxlen
settings.input_types = [
integer_value_sequence(vocab_size), integer_value(2)
]
@provider(
init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM)
def process(settings, file):
f = open(file, 'rb')
train_set = pickle.load(f)
f.close()
x, y = train_set
# remove unk, namely remove the words out of dictionary
x = remove_unk(x, settings.vocab_size)
if settings.pad_seq:
x = pad_sequences(x, maxlen=settings.maxlen, value=0.)
for i in range(len(y)):
yield map(int, x[i]), int(y[i])
#!/usr/bin/env python
from paddle.trainer_config_helpers import *
import imdb
num_class = 2
vocab_size = 30000
fixedlen = 100
batch_size = get_config_arg('batch_size', int, 128)
lstm_num = get_config_arg('lstm_num', int, 1)
hidden_size = get_config_arg('hidden_size', int, 128)
# whether to pad sequence into fixed length
pad_seq = get_config_arg('pad_seq', bool, True)
imdb.create_data('imdb.pkl')
args = {'vocab_size': vocab_size, 'pad_seq': pad_seq, 'maxlen': fixedlen}
define_py_data_sources2(
"train.list", None, module="provider", obj="process", args=args)
settings(
batch_size=batch_size,
learning_rate=2e-3,
learning_method=AdamOptimizer(),
regularization=L2Regularization(8e-4),
gradient_clipping_threshold=25)
net = data_layer('data', size=vocab_size)
net = embedding_layer(input=net, size=128)
for i in xrange(lstm_num):
net = simple_lstm(input=net, size=hidden_size)
net = last_seq(input=net)
net = fc_layer(input=net, size=2, act=SoftmaxActivation())
lab = data_layer('label', num_class)
loss = classification_cost(input=net, label=lab)
outputs(loss)
#!/bin/bash
set -e
function train() {
cfg=$1
thread=$2
args="lstm_num=${3},seq_pad=${4},hidden_size=${5},batch_size=${6}"
paddle train --job=time \
--config=$cfg \
--use_gpu=1 \
--trainer_count=$thread \
--log_period=10 \
--test_period=100 \
--num_passes=1 \
--feed_data=1 \
--config_args=$args \
>logs/rnn-pad${4}-${thread}gpu-lstm${3}-batch${6}-hid${5}.log 2>&1
}
if [ ! -d "logs" ]; then
mkdir logs
fi
## padding, single gpu
#-----config--gpu--lstm_num--padding--hidden_size--batch_size
## lstm_num=2, batch_size=64
train rnn.py 1 2 1 256 64
train rnn.py 1 2 1 512 64
train rnn.py 1 2 1 1280 64
## lstm_num=2, batch_size=128
train rnn.py 1 2 1 256 128
train rnn.py 1 2 1 512 128
train rnn.py 1 2 1 1280 128
## lstm_num=4, batch_size=256
train rnn.py 1 2 1 256 256
train rnn.py 1 2 1 512 256
train rnn.py 1 2 1 1280 256
#==================multi gpus=====================#
# hidden_size=256, lstm_num=2, different batch size
train rnn.py 4 2 1 256 128
train rnn.py 4 2 1 256 256
train rnn.py 4 2 1 256 512
# hidden_size=512, lstm_num=4, different batch size
train rnn.py 4 2 1 512 128
train rnn.py 4 2 1 512 256
train rnn.py 4 2 1 512 512
...@@ -35,8 +35,6 @@ import os ...@@ -35,8 +35,6 @@ import os
import argparse import argparse
import time import time
import paddle.v2 as paddle
parser = argparse.ArgumentParser(description=__doc__) parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument( parser.add_argument(
"--embedding_dim", "--embedding_dim",
......
...@@ -21,7 +21,6 @@ import time ...@@ -21,7 +21,6 @@ import time
import numpy as np import numpy as np
import tensorflow as tf import tensorflow as tf
import paddle.v2 as paddle
DTYPE = tf.float32 DTYPE = tf.float32
......
...@@ -27,7 +27,6 @@ import argparse ...@@ -27,7 +27,6 @@ import argparse
import time import time
import numpy as np import numpy as np
import paddle.v2 as paddle
import tensorflow as tf import tensorflow as tf
DTYPE = tf.float32 DTYPE = tf.float32
......
...@@ -21,8 +21,6 @@ import argparse ...@@ -21,8 +21,6 @@ import argparse
import time import time
import tensorflow as tf import tensorflow as tf
import paddle.v2 as paddle
def parse_args(): def parse_args():
parser = argparse.ArgumentParser("LSTM model benchmark.") parser = argparse.ArgumentParser("LSTM model benchmark.")
......
...@@ -13,7 +13,6 @@ ...@@ -13,7 +13,6 @@
# limitations under the License. # limitations under the License.
"""VGG16 benchmark in TensorFlow""" """VGG16 benchmark in TensorFlow"""
import tensorflow as tf import tensorflow as tf
import paddle.v2 as paddle
import numpy as np import numpy as np
import argparse import argparse
import time import time
......
...@@ -20,31 +20,10 @@ if(WITH_DSO) ...@@ -20,31 +20,10 @@ if(WITH_DSO)
add_definitions(-DPADDLE_USE_DSO) add_definitions(-DPADDLE_USE_DSO)
endif(WITH_DSO) endif(WITH_DSO)
if(WITH_DOUBLE)
add_definitions(-DPADDLE_TYPE_DOUBLE)
endif(WITH_DOUBLE)
if(WITH_ARM_FP16)
add_definitions(-DPADDLE_ARM_FP16)
add_definitions("-march=armv8.2-a+fp16+simd")
endif(WITH_ARM_FP16)
if(WITH_TESTING) if(WITH_TESTING)
add_definitions(-DPADDLE_WITH_TESTING) add_definitions(-DPADDLE_WITH_TESTING)
endif(WITH_TESTING) endif(WITH_TESTING)
if(NOT WITH_TIMER)
add_definitions(-DPADDLE_DISABLE_TIMER)
endif(NOT WITH_TIMER)
if(USE_EIGEN_FOR_BLAS)
add_definitions(-DPADDLE_USE_EIGEN_FOR_BLAS)
endif(USE_EIGEN_FOR_BLAS)
if(EIGEN_USE_THREADS)
add_definitions(-DEIGEN_USE_THREADS)
endif(EIGEN_USE_THREADS)
if(NOT WITH_PROFILER) if(NOT WITH_PROFILER)
add_definitions(-DPADDLE_DISABLE_PROFILER) add_definitions(-DPADDLE_DISABLE_PROFILER)
endif(NOT WITH_PROFILER) endif(NOT WITH_PROFILER)
...@@ -78,10 +57,6 @@ if(WIN32) ...@@ -78,10 +57,6 @@ if(WIN32)
endif(NOT MSVC) endif(NOT MSVC)
endif(WIN32) endif(WIN32)
if(NOT WITH_GOLANG)
add_definitions(-DPADDLE_WITHOUT_GOLANG)
endif(NOT WITH_GOLANG)
if(WITH_PSLIB) if(WITH_PSLIB)
add_definitions(-DPADDLE_WITH_PSLIB) add_definitions(-DPADDLE_WITH_PSLIB)
endif() endif()
...@@ -171,55 +146,6 @@ if(WITH_DISTRIBUTE) ...@@ -171,55 +146,6 @@ if(WITH_DISTRIBUTE)
add_definitions(-DPADDLE_WITH_DISTRIBUTE) add_definitions(-DPADDLE_WITH_DISTRIBUTE)
endif() endif()
if(WITH_GOLANG)
# we need to symlink Paddle directory into GOPATH. If we
# don't do it and we have code that depends on Paddle, go
# get ./... will download a new Paddle repo from Github,
# without the changes in our current Paddle repo that we
# want to build.
set(GOPATH "${CMAKE_CURRENT_BINARY_DIR}/go")
file(MAKE_DIRECTORY ${GOPATH})
set(PADDLE_IN_GOPATH "${GOPATH}/src/github.com/PaddlePaddle/Paddle")
file(MAKE_DIRECTORY "${PADDLE_IN_GOPATH}")
set(PADDLE_GO_PATH "${CMAKE_SOURCE_DIR}/go")
add_custom_target(go_path)
add_custom_command(TARGET go_path
# Symlink Paddle directory into GOPATH
COMMAND mkdir -p ${PADDLE_IN_GOPATH}
COMMAND rm -rf ${PADDLE_IN_GOPATH}
COMMAND ln -sf ${CMAKE_SOURCE_DIR} ${PADDLE_IN_GOPATH}
# Automatically get all dependencies specified in the source code
# We can't run `go get -d ./...` for every target, because
# multiple `go get` can not run concurrently, but make need to be
# able to run with multiple jobs.
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
)
if (GLIDE_INSTALL)
if(EXISTS $ENV{GOPATH}/bin/glide)
set(GLIDE "$ENV{GOPATH}/bin/glide")
else()
message(FATAL_ERROR "no glide executeble found: $ENV{GOPATH}/bin/glide")
endif()
# this command will only run when the file it depends is missing
# or has changed, or the output is missing.
add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/glide
COMMAND env GOPATH=${GOPATH} ${GLIDE} install
COMMAND touch ${CMAKE_BINARY_DIR}/glide
DEPENDS ${PADDLE_SOURCE_DIR}/go/glide.lock
WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go"
)
# depends on the custom command which outputs
# ${CMAKE_BINARY_DIR}/glide, the custom command does not need to
# run every time this target is built.
add_custom_target(go_vendor DEPENDS ${CMAKE_BINARY_DIR}/glide go_path)
endif()
endif(WITH_GOLANG)
if(WITH_GRPC) if(WITH_GRPC)
add_definitions(-DPADDLE_WITH_GRPC) add_definitions(-DPADDLE_WITH_GRPC)
endif(WITH_GRPC) endif(WITH_GRPC)
......
...@@ -168,10 +168,7 @@ elseif (${CUDA_VERSION} LESS 11.0) # CUDA 10.x ...@@ -168,10 +168,7 @@ elseif (${CUDA_VERSION} LESS 11.0) # CUDA 10.x
endif() endif()
include_directories(${CUDA_INCLUDE_DIRS}) include_directories(${CUDA_INCLUDE_DIRS})
list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
if(NOT WITH_DSO) if(NOT WITH_DSO)
# TODO(panyx0718): CUPTI only allows DSO?
list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUPTI_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
if(WIN32) if(WIN32)
set_property(GLOBAL PROPERTY CUDA_MODULES ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY}) set_property(GLOBAL PROPERTY CUDA_MODULES ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY})
endif(WIN32) endif(WIN32)
......
...@@ -74,5 +74,3 @@ add_dependencies(anakin_shared extern_anakin) ...@@ -74,5 +74,3 @@ add_dependencies(anakin_shared extern_anakin)
add_library(anakin_saber SHARED IMPORTED GLOBAL) add_library(anakin_saber SHARED IMPORTED GLOBAL)
set_property(TARGET anakin_saber PROPERTY IMPORTED_LOCATION ${ANAKIN_SABER_LIB}) set_property(TARGET anakin_saber PROPERTY IMPORTED_LOCATION ${ANAKIN_SABER_LIB})
add_dependencies(anakin_saber extern_anakin) add_dependencies(anakin_saber extern_anakin)
list(APPEND external_project_dependencies anakin_shared anakin_saber)
INCLUDE(ExternalProject)
SET(ANY_SOURCE_DIR ${THIRD_PARTY_PATH}/any)
INCLUDE_DIRECTORIES(${ANY_SOURCE_DIR}/src/extern_lib_any)
ExternalProject_Add(
extern_lib_any
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/PaddlePaddle/any.git"
GIT_TAG "15595d8324be9e8a9a80d9ae442fdd12bd66df5d"
PREFIX ${ANY_SOURCE_DIR}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND ""
TEST_COMMAND ""
)
if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/lib_any_dummy.c)
file(WRITE ${dummyfile} "const char * dummy_any = \"${dummyfile}\";")
add_library(lib_any STATIC ${dummyfile})
else()
add_library(lib_any INTERFACE)
endif()
add_dependencies(lib_any extern_lib_any)
add_definitions(-DANY_IMPL_ANY_CAST_MOVEABLE)
LIST(APPEND external_project_dependencies lib_any)
...@@ -57,5 +57,4 @@ else() ...@@ -57,5 +57,4 @@ else()
endif() endif()
add_dependencies(boost ${BOOST_PROJECT}) add_dependencies(boost ${BOOST_PROJECT})
list(APPEND external_project_dependencies boost)
set(Boost_INCLUDE_DIR ${BOOST_INCLUDE_DIR}) set(Boost_INCLUDE_DIR ${BOOST_INCLUDE_DIR})
...@@ -69,5 +69,3 @@ SET_PROPERTY(TARGET brpc PROPERTY IMPORTED_LOCATION ${BRPC_LIBRARIES}) ...@@ -69,5 +69,3 @@ SET_PROPERTY(TARGET brpc PROPERTY IMPORTED_LOCATION ${BRPC_LIBRARIES})
ADD_DEPENDENCIES(brpc extern_brpc) ADD_DEPENDENCIES(brpc extern_brpc)
add_definitions(-DBRPC_WITH_GLOG) add_definitions(-DBRPC_WITH_GLOG)
LIST(APPEND external_project_dependencies brpc)
...@@ -31,5 +31,3 @@ else() ...@@ -31,5 +31,3 @@ else()
endif() endif()
add_dependencies(cub extern_cub) add_dependencies(cub extern_cub)
LIST(APPEND external_project_dependencies cub)
...@@ -27,5 +27,3 @@ else() ...@@ -27,5 +27,3 @@ else()
endif() endif()
add_dependencies(dlpack extern_dlpack) add_dependencies(dlpack extern_dlpack)
LIST(APPEND external_project_dependencies dlpack)
...@@ -52,5 +52,3 @@ else() ...@@ -52,5 +52,3 @@ else()
endif() endif()
add_dependencies(eigen3 extern_eigen3) add_dependencies(eigen3 extern_eigen3)
LIST(APPEND external_project_dependencies eigen3)
...@@ -61,8 +61,6 @@ ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL) ...@@ -61,8 +61,6 @@ ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET gflags PROPERTY IMPORTED_LOCATION ${GFLAGS_LIBRARIES}) SET_PROPERTY(TARGET gflags PROPERTY IMPORTED_LOCATION ${GFLAGS_LIBRARIES})
ADD_DEPENDENCIES(gflags extern_gflags) ADD_DEPENDENCIES(gflags extern_gflags)
LIST(APPEND external_project_dependencies gflags)
# On Windows (including MinGW), the Shlwapi library is used by gflags if available. # On Windows (including MinGW), the Shlwapi library is used by gflags if available.
if (WIN32) if (WIN32)
include(CheckIncludeFileCXX) include(CheckIncludeFileCXX)
......
...@@ -72,5 +72,3 @@ ADD_LIBRARY(glog STATIC IMPORTED GLOBAL) ...@@ -72,5 +72,3 @@ ADD_LIBRARY(glog STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET glog PROPERTY IMPORTED_LOCATION ${GLOG_LIBRARIES}) SET_PROPERTY(TARGET glog PROPERTY IMPORTED_LOCATION ${GLOG_LIBRARIES})
ADD_DEPENDENCIES(glog extern_glog gflags) ADD_DEPENDENCIES(glog extern_glog gflags)
LINK_LIBRARIES(glog gflags) LINK_LIBRARIES(glog gflags)
LIST(APPEND external_project_dependencies glog)
...@@ -79,5 +79,4 @@ IF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC)) ...@@ -79,5 +79,4 @@ IF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC))
SET_PROPERTY(TARGET gtest_main PROPERTY IMPORTED_LOCATION ${GTEST_MAIN_LIBRARIES}) SET_PROPERTY(TARGET gtest_main PROPERTY IMPORTED_LOCATION ${GTEST_MAIN_LIBRARIES})
ADD_DEPENDENCIES(gtest_main extern_gtest) ADD_DEPENDENCIES(gtest_main extern_gtest)
LIST(APPEND external_project_dependencies gtest gtest_main)
ENDIF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC)) ENDIF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC))
...@@ -39,6 +39,3 @@ ADD_DEPENDENCIES(extern_leveldb snappy) ...@@ -39,6 +39,3 @@ ADD_DEPENDENCIES(extern_leveldb snappy)
ADD_LIBRARY(leveldb STATIC IMPORTED GLOBAL) ADD_LIBRARY(leveldb STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET leveldb PROPERTY IMPORTED_LOCATION ${LEVELDB_LIBRARIES}) SET_PROPERTY(TARGET leveldb PROPERTY IMPORTED_LOCATION ${LEVELDB_LIBRARIES})
ADD_DEPENDENCIES(leveldb extern_leveldb) ADD_DEPENDENCIES(leveldb extern_leveldb)
LIST(APPEND external_project_dependencies leveldb)
...@@ -72,7 +72,4 @@ else() ...@@ -72,7 +72,4 @@ else()
add_library(libmct INTERFACE) add_library(libmct INTERFACE)
endif() endif()
#ADD_LIBRARY(libmct SHARED IMPORTED GLOBAL)
ADD_DEPENDENCIES(libmct ${LIBMCT_PROJECT}) ADD_DEPENDENCIES(libmct ${LIBMCT_PROJECT})
LIST(APPEND external_project_dependencies libmct)
...@@ -53,5 +53,3 @@ MESSAGE(STATUS "Libxsmm library: ${LIBXSMM_LIBS}") ...@@ -53,5 +53,3 @@ MESSAGE(STATUS "Libxsmm library: ${LIBXSMM_LIBS}")
include_directories(${LIBXSMM_INCLUDE_DIR}) include_directories(${LIBXSMM_INCLUDE_DIR})
ADD_DEFINITIONS(-DPADDLE_WITH_LIBXSMM) ADD_DEFINITIONS(-DPADDLE_WITH_LIBXSMM)
ADD_DEPENDENCIES(libxsmm extern_libxsmm) ADD_DEPENDENCIES(libxsmm extern_libxsmm)
LIST(APPEND external_project_dependencies libxsmm)
...@@ -89,7 +89,6 @@ SET_PROPERTY(TARGET shared_mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB}) ...@@ -89,7 +89,6 @@ SET_PROPERTY(TARGET shared_mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB})
ADD_DEPENDENCIES(shared_mkldnn ${MKLDNN_PROJECT}) ADD_DEPENDENCIES(shared_mkldnn ${MKLDNN_PROJECT})
MESSAGE(STATUS "MKLDNN library: ${MKLDNN_LIB}") MESSAGE(STATUS "MKLDNN library: ${MKLDNN_LIB}")
add_definitions(-DPADDLE_WITH_MKLDNN) add_definitions(-DPADDLE_WITH_MKLDNN)
LIST(APPEND external_project_dependencies shared_mkldnn)
# generate a static dummy target to track mkldnn dependencies # generate a static dummy target to track mkldnn dependencies
# for cc_library(xxx SRCS xxx.c DEPS mkldnn) # for cc_library(xxx SRCS xxx.c DEPS mkldnn)
......
...@@ -39,8 +39,10 @@ IF(WIN32) ...@@ -39,8 +39,10 @@ IF(WIN32)
SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.lib) SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.lib)
SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/mklml.dll) SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/mklml.dll)
SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.dll) SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.dll)
ELSE() ELSE()
SET(MKLML_VER "mklml_lnx_${TIME_VERSION}" CACHE STRING "" FORCE) #TODO(intel-huying):
# Now enable Erf function in mklml library temporarily, it will be updated as offical version later.
SET(MKLML_VER "Glibc225_vsErf_mklml_lnx_${TIME_VERSION}" CACHE STRING "" FORCE)
SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE) SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so) SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so)
SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so) SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so)
...@@ -73,4 +75,3 @@ INCLUDE_DIRECTORIES(${MKLML_INC_DIR}) ...@@ -73,4 +75,3 @@ INCLUDE_DIRECTORIES(${MKLML_INC_DIR})
ADD_LIBRARY(mklml SHARED IMPORTED GLOBAL) ADD_LIBRARY(mklml SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET mklml PROPERTY IMPORTED_LOCATION ${MKLML_LIB}) SET_PROPERTY(TARGET mklml PROPERTY IMPORTED_LOCATION ${MKLML_LIB})
ADD_DEPENDENCIES(mklml ${MKLML_PROJECT}) ADD_DEPENDENCIES(mklml ${MKLML_PROJECT})
LIST(APPEND external_project_dependencies mklml)
...@@ -37,7 +37,7 @@ INCLUDE(GNUInstallDirs) ...@@ -37,7 +37,7 @@ INCLUDE(GNUInstallDirs)
INCLUDE(ExternalProject) INCLUDE(ExternalProject)
SET(NGRAPH_PROJECT "extern_ngraph") SET(NGRAPH_PROJECT "extern_ngraph")
SET(NGRAPH_GIT_TAG "20bd8bbc79ae3a81c57313846a2be7313e5d1dab") SET(NGRAPH_GIT_TAG "a444f7a959b7d87f2c117c9b57a4c387759e481e")
SET(NGRAPH_SOURCES_DIR ${THIRD_PARTY_PATH}/ngraph) SET(NGRAPH_SOURCES_DIR ${THIRD_PARTY_PATH}/ngraph)
SET(NGRAPH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/ngraph) SET(NGRAPH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/ngraph)
SET(NGRAPH_INC_DIR ${NGRAPH_INSTALL_DIR}/include) SET(NGRAPH_INC_DIR ${NGRAPH_INSTALL_DIR}/include)
...@@ -69,7 +69,7 @@ ExternalProject_Add( ...@@ -69,7 +69,7 @@ ExternalProject_Add(
CMAKE_ARGS -DNGRAPH_DEX_ONLY=TRUE CMAKE_ARGS -DNGRAPH_DEX_ONLY=TRUE
CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
CMAKE_ARGS -DMKLDNN_INCLUDE_DIR=${MKLDNN_INC_DIR} CMAKE_ARGS -DMKLDNN_INCLUDE_DIR=${MKLDNN_INC_DIR}
CMAKE_ARGS -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/lib CMAKE_ARGS -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}
CMAKE_ARGS -DMKLML_LIB_DIR=${MKLML_INSTALL_DIR}/lib CMAKE_ARGS -DMKLML_LIB_DIR=${MKLML_INSTALL_DIR}/lib
) )
...@@ -77,4 +77,3 @@ add_dependencies(ngraph ${NGRAPH_PROJECT}) ...@@ -77,4 +77,3 @@ add_dependencies(ngraph ${NGRAPH_PROJECT})
target_compile_definitions(ngraph INTERFACE -DPADDLE_WITH_NGRAPH) target_compile_definitions(ngraph INTERFACE -DPADDLE_WITH_NGRAPH)
target_include_directories(ngraph INTERFACE ${NGRAPH_INC_DIR}) target_include_directories(ngraph INTERFACE ${NGRAPH_INC_DIR})
target_link_libraries(ngraph INTERFACE ${NGRAPH_SHARED_LIB}) target_link_libraries(ngraph INTERFACE ${NGRAPH_SHARED_LIB})
LIST(APPEND external_project_dependencies ngraph)
...@@ -11,11 +11,6 @@ ...@@ -11,11 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
IF(USE_EIGEN_FOR_BLAS)
return()
ENDIF(USE_EIGEN_FOR_BLAS)
INCLUDE(cblas) INCLUDE(cblas)
IF(NOT ${CBLAS_FOUND}) IF(NOT ${CBLAS_FOUND})
...@@ -91,7 +86,6 @@ ENDIF() ...@@ -91,7 +86,6 @@ ENDIF()
IF(NOT ${CBLAS_FOUND}) IF(NOT ${CBLAS_FOUND})
ADD_DEPENDENCIES(cblas extern_openblas) ADD_DEPENDENCIES(cblas extern_openblas)
LIST(APPEND external_project_dependencies cblas)
ELSE() ELSE()
IF("${CBLAS_PROVIDER}" STREQUAL "MKLML") IF("${CBLAS_PROVIDER}" STREQUAL "MKLML")
ADD_DEPENDENCIES(cblas mklml) ADD_DEPENDENCIES(cblas mklml)
......
...@@ -129,7 +129,6 @@ macro(PROMPT_PROTOBUF_LIB) ...@@ -129,7 +129,6 @@ macro(PROMPT_PROTOBUF_LIB)
ADD_DEPENDENCIES(protoc ${dep}) ADD_DEPENDENCIES(protoc ${dep})
ENDFOREACH() ENDFOREACH()
LIST(APPEND external_project_dependencies protobuf)
RETURN() RETURN()
endmacro() endmacro()
macro(SET_PROTOBUF_VERSION) macro(SET_PROTOBUF_VERSION)
...@@ -231,7 +230,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) ...@@ -231,7 +230,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
) )
ENDFUNCTION() ENDFUNCTION()
SET(PROTOBUF_VERSION 3.1) SET(PROTOBUF_VERSION 3.1.0)
IF(NOT PROTOBUF_FOUND) IF(NOT PROTOBUF_FOUND)
build_protobuf(extern_protobuf FALSE) build_protobuf(extern_protobuf FALSE)
......
...@@ -70,4 +70,3 @@ ExternalProject_Add( ...@@ -70,4 +70,3 @@ ExternalProject_Add(
ADD_LIBRARY(pslib SHARED IMPORTED GLOBAL) ADD_LIBRARY(pslib SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET pslib PROPERTY IMPORTED_LOCATION ${PSLIB_LIB}) SET_PROPERTY(TARGET pslib PROPERTY IMPORTED_LOCATION ${PSLIB_LIB})
ADD_DEPENDENCIES(pslib ${PSLIB_PROJECT}) ADD_DEPENDENCIES(pslib ${PSLIB_PROJECT})
LIST(APPEND external_project_dependencies pslib)
...@@ -70,4 +70,3 @@ ExternalProject_Add( ...@@ -70,4 +70,3 @@ ExternalProject_Add(
ADD_LIBRARY(pslib_brpc SHARED IMPORTED GLOBAL) ADD_LIBRARY(pslib_brpc SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET pslib_brpc PROPERTY IMPORTED_LOCATION ${PSLIB_BRPC_LIB}) SET_PROPERTY(TARGET pslib_brpc PROPERTY IMPORTED_LOCATION ${PSLIB_BRPC_LIB})
ADD_DEPENDENCIES(pslib_brpc ${PSLIB_BRPC_PROJECT}) ADD_DEPENDENCIES(pslib_brpc ${PSLIB_BRPC_PROJECT})
LIST(APPEND external_project_dependencies pslib_brpc)
...@@ -26,5 +26,3 @@ else() ...@@ -26,5 +26,3 @@ else()
endif() endif()
add_dependencies(simple_threadpool extern_threadpool) add_dependencies(simple_threadpool extern_threadpool)
LIST(APPEND external_project_dependencies simple_threadpool)
...@@ -83,5 +83,3 @@ INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include wa ...@@ -83,5 +83,3 @@ INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include wa
ADD_LIBRARY(warpctc SHARED IMPORTED GLOBAL) ADD_LIBRARY(warpctc SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET warpctc PROPERTY IMPORTED_LOCATION ${WARPCTC_LIBRARIES}) SET_PROPERTY(TARGET warpctc PROPERTY IMPORTED_LOCATION ${WARPCTC_LIBRARIES})
ADD_DEPENDENCIES(warpctc extern_warpctc) ADD_DEPENDENCIES(warpctc extern_warpctc)
LIST(APPEND external_project_dependencies warpctc)
...@@ -55,4 +55,3 @@ else() ...@@ -55,4 +55,3 @@ else()
endif() endif()
add_dependencies(xbyak ${XBYAK_PROJECT}) add_dependencies(xbyak ${XBYAK_PROJECT})
list(APPEND external_project_dependencies xbyak)
...@@ -71,5 +71,3 @@ add_library(xxhash STATIC IMPORTED GLOBAL) ...@@ -71,5 +71,3 @@ add_library(xxhash STATIC IMPORTED GLOBAL)
set_property(TARGET xxhash PROPERTY IMPORTED_LOCATION ${XXHASH_LIBRARIES}) set_property(TARGET xxhash PROPERTY IMPORTED_LOCATION ${XXHASH_LIBRARIES})
include_directories(${XXHASH_INCLUDE_DIR}) include_directories(${XXHASH_INCLUDE_DIR})
add_dependencies(xxhash extern_xxhash) add_dependencies(xxhash extern_xxhash)
LIST(APPEND external_project_dependencies xxhash)
...@@ -57,5 +57,3 @@ ENDIF(WIN32) ...@@ -57,5 +57,3 @@ ENDIF(WIN32)
ADD_LIBRARY(zlib STATIC IMPORTED GLOBAL) ADD_LIBRARY(zlib STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET zlib PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES}) SET_PROPERTY(TARGET zlib PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES})
ADD_DEPENDENCIES(zlib extern_zlib) ADD_DEPENDENCIES(zlib extern_zlib)
LIST(APPEND external_project_dependencies zlib)
...@@ -11,8 +11,6 @@ include_directories("/opt/rocm/rocrand/include") ...@@ -11,8 +11,6 @@ include_directories("/opt/rocm/rocrand/include")
include_directories("/opt/rocm/rccl/include") include_directories("/opt/rocm/rccl/include")
include_directories("/opt/rocm/thrust") include_directories("/opt/rocm/thrust")
list(APPEND EXTERNAL_LIBS "-L/opt/rocm/lib/ -lhip_hcc")
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -std=c++11" ) set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -std=c++11" )
if(WITH_DSO) if(WITH_DSO)
...@@ -31,22 +29,12 @@ if(WITH_GRPC) ...@@ -31,22 +29,12 @@ if(WITH_GRPC)
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_GRPC") set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_GRPC")
endif(WITH_GRPC) endif(WITH_GRPC)
if(NOT WITH_GOLANG)
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITHOUT_GOLANG")
endif(NOT WITH_GOLANG)
if(WITH_MKLDNN) if(WITH_MKLDNN)
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_MKLDNN") set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_MKLDNN")
endif(WITH_MKLDNN) endif(WITH_MKLDNN)
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DANY_IMPL_ANY_CAST_MOVEABLE") set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DANY_IMPL_ANY_CAST_MOVEABLE")
if(NOT WITH_RDMA)
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_DISABLE_RDMA")
endif(NOT WITH_RDMA)
if(CMAKE_BUILD_TYPE STREQUAL "Debug") if(CMAKE_BUILD_TYPE STREQUAL "Debug")
list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG}) list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG})
elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
......
...@@ -153,7 +153,11 @@ function(op_library TARGET) ...@@ -153,7 +153,11 @@ function(op_library TARGET)
# pybind USE_OP_DEVICE_KERNEL for CUDNN # pybind USE_OP_DEVICE_KERNEL for CUDNN
list(LENGTH cudnn_cu_cc_srcs cudnn_cu_cc_srcs_len) list(LENGTH cudnn_cu_cc_srcs cudnn_cu_cc_srcs_len)
if (WITH_GPU AND ${cudnn_cu_cc_srcs_len} GREATER 0) if (WITH_GPU AND ${cudnn_cu_cc_srcs_len} GREATER 0)
if(${TARGET} STREQUAL "activation")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, CUDNN);\n")
else()
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n") file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n")
endif()
endif() endif()
# pybind USE_OP_DEVICE_KERNEL for MIOPEN # pybind USE_OP_DEVICE_KERNEL for MIOPEN
...@@ -168,6 +172,9 @@ function(op_library TARGET) ...@@ -168,6 +172,9 @@ function(op_library TARGET)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, MKLDNN);\n") file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, MKLDNN);\n")
elseif(${MKLDNN_FILE} STREQUAL "conv_mkldnn_op") elseif(${MKLDNN_FILE} STREQUAL "conv_mkldnn_op")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, FP32);\n") file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, FP32);\n")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, S8);\n")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, U8);\n")
else() else()
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n") file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n")
endif() endif()
......
# user should download rdma first from subversion repository
# execute following instruction to download svn mannally
# svn co https://svn.baidu.com/sys/ip/trunk/rdma/sockrdmav1 rdma/
# svn co https://svn.baidu.com/sys/ip/trunk/rdma/thirdparty rdma/
# we use static output in svn repositories to avoid implict bugs from not standard runtime env.
if(WITH_RDMA)
set(RDMA_ROOT $ENV{RDMA_ROOT} CACHE PATH "Folder contains RDMA sock library and thirdparty library")
function(generate_rdma_links)
#redirect to current DIR to isolate the pollution from system runtime environment
#it can benifits unified control for different gcc environment.
#e.g, by default gcc48 did not refer /usr/lib64 which could contain low version
#runtime libraries that will crash process while loading it. That redirect trick
#can fix it.
execute_process(
COMMAND mkdir -p librdma
COMMAND ln -s -f /usr/lib64/libibverbs.so.1.0.0 librdma/libibverbs.so.1
COMMAND ln -s -f /usr/lib64/libibverbs.so.1.0.0 librdma/libibverbs.so
COMMAND ln -s -f /usr/lib64/librdmacm.so.1.0.0 librdma/librdmacm.so.1
COMMAND ln -s -f /usr/lib64/librdmacm.so.1.0.0 librdma/librdmacm.so
COMMAND ln -s -f /lib64/libnl.so.1.1.4 librdma/libnl.so.1
COMMAND ln -s -f /lib64/libnl.so.1.1.4 librdma/libnl.so
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
)
endfunction(generate_rdma_links)
#check and set headers
find_path(RDMA_INC_SXISOCK sxi_sock.h PATHS ${RDMA_ROOT}/sockrdmav1/output/include)
find_path(RDMA_INC_XIO libxio.h PATHS ${RDMA_ROOT}/thirdparty/output/accelio)
find_path(RDMA_INC_EVENT event2 PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
find_path(RDMA_INC_NUMA numa.h PATHS ${RDMA_ROOT}/thirdparty/output/libnuma)
#check and set libs
find_library(RDMA_LIB_SXISOCK NAMES sxisock PATHS ${RDMA_ROOT}/sockrdmav1/output)
find_library(RDMA_LIB_XIO NAMES xio PATHS ${RDMA_ROOT}/thirdparty/output/accelio)
find_library(RDMA_LIB_EVENT NAMES event PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
find_library(RDMA_LIB_EVENT_CORE NAMES event_core PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
find_library(RDMA_LIB_EVENT_EXTRA NAMES event_extra PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
find_library(RDMA_LIB_EVENT_PTHREADS NAMES event_pthreads PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
find_library(RDMA_LIB_NUMA NAMES numa PATHS ${RDMA_ROOT}/thirdparty/output/libnuma)
if(
RDMA_INC_SXISOCK AND
RDMA_INC_XIO AND
RDMA_INC_EVENT AND
RDMA_INC_NUMA AND
RDMA_LIB_SXISOCK AND
RDMA_LIB_XIO AND
RDMA_LIB_EVENT AND
RDMA_LIB_EVENT_CORE AND
RDMA_LIB_EVENT_EXTRA AND
RDMA_LIB_EVENT_PTHREADS AND
RDMA_LIB_NUMA
)
set(RDMA_INC_DIR
${RDMA_INC_SXISOCK}
${RDMA_INC_XIO}
${RDMA_INC_EVENT}
${RDMA_INC_NUMA})
set(RDMA_LIBS
${RDMA_LIB_SXISOCK}
${RDMA_LIB_XIO}
${RDMA_LIB_EVENT}
${RDMA_LIB_EVENT_CORE}
${RDMA_LIB_EVENT_EXTRA}
${RDMA_LIB_EVENT_PTHREADS}
${RDMA_LIB_NUMA}
)
set(RDMA_LD_FLAGS "-L./librdma -libverbs -lrdmacm -Xlinker -rpath ./librdma")
include_directories("${RDMA_INC_DIR}")
else()
#if this module is not called, RDMA_INC_DIR RDMA_LIBS will be null, so top module always refer this variable
message(FATAL_ERROR, "RDMA libraries are not found, try to set RDMA_ROOT or check all related libraries.")
endif()
else(WITH_RDMA)
set(RDMA_LIBS "")
set(RDMA_LD_FLAGS "")
add_definitions(-DPADDLE_DISABLE_RDMA)
endif(WITH_RDMA)
...@@ -33,6 +33,5 @@ if(TENSORRT_FOUND) ...@@ -33,6 +33,5 @@ if(TENSORRT_FOUND)
message(STATUS "Current TensorRT header is ${TENSORRT_INCLUDE_DIR}/NvInfer.h. " message(STATUS "Current TensorRT header is ${TENSORRT_INCLUDE_DIR}/NvInfer.h. "
"Current TensorRT version is v${TENSORRT_MAJOR_VERSION}. ") "Current TensorRT version is v${TENSORRT_MAJOR_VERSION}. ")
include_directories(${TENSORRT_INCLUDE_DIR}) include_directories(${TENSORRT_INCLUDE_DIR})
list(APPEND EXTERNAL_LIBS ${TENSORRT_LIBRARY})
add_definitions(-DPADDLE_WITH_TENSORRT) add_definitions(-DPADDLE_WITH_TENSORRT)
endif() endif()
...@@ -14,9 +14,7 @@ cmake .. -DWITH_AVX=OFF \ ...@@ -14,9 +14,7 @@ cmake .. -DWITH_AVX=OFF \
-DWITH_MKL=OFF \ -DWITH_MKL=OFF \
-DWITH_GPU=ON \ -DWITH_GPU=ON \
-DWITH_TESTING=ON \ -DWITH_TESTING=ON \
-DWITH_TIMER=ON \
-DWITH_PROFILER=ON \ -DWITH_PROFILER=ON \
-DWITH_FLUID_ONLY=ON
make -j `nproc` make -j `nproc`
pip install -U "$WHEEL_PATH/$(ls $WHEEL_PATH)" pip install -U "$WHEEL_PATH/$(ls $WHEEL_PATH)"
......
此差异已折叠。
...@@ -244,6 +244,7 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, ...@@ -244,6 +244,7 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program,
auto& block = main_program.Block(0); auto& block = main_program.Block(0);
for (auto var_name : fetch_var_names) { for (auto var_name : fetch_var_names) {
auto var_desc = block.FindVar(var_name); auto var_desc = block.FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(var_desc, "%s is not found.", var_name);
auto shapes = var_desc->GetShape(); auto shapes = var_desc->GetShape();
PADDLE_ENFORCE(shapes[shapes.size() - 1] == 1, PADDLE_ENFORCE(shapes[shapes.size() - 1] == 1,
"var %s: Fetched var has wrong shape, " "var %s: Fetched var has wrong shape, "
......
...@@ -13,7 +13,11 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/block_desc.h"
#include <queue> #include <queue>
#include <unordered_set>
#include <utility>
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/program_desc.h"
...@@ -155,6 +159,16 @@ void BlockDesc::RemoveOp(size_t s, size_t e) { ...@@ -155,6 +159,16 @@ void BlockDesc::RemoveOp(size_t s, size_t e) {
ops_.erase(ops_.begin() + s, ops_.begin() + e); ops_.erase(ops_.begin() + s, ops_.begin() + e);
} }
void BlockDesc::RemoveOpInternal(const OpDesc *op_desc) {
// TODO(minqiyang): make this faster
for (auto it = ops_.begin(); it != ops_.end(); ++it) {
if (it->get() == op_desc) {
ops_.erase(it);
break;
}
}
}
std::vector<OpDesc *> BlockDesc::AllOps() const { std::vector<OpDesc *> BlockDesc::AllOps() const {
std::vector<OpDesc *> res; std::vector<OpDesc *> res;
for (const auto &op : ops_) { for (const auto &op : ops_) {
......
...@@ -93,6 +93,8 @@ class BlockDesc { ...@@ -93,6 +93,8 @@ class BlockDesc {
*/ */
void RemoveOp(size_t s, size_t e); void RemoveOp(size_t s, size_t e);
void RemoveOpInternal(const OpDesc *op_desc);
void RemoveVar(const std::string &name) { vars_.erase(name); } void RemoveVar(const std::string &name) { vars_.erase(name); }
std::vector<OpDesc *> AllOps() const; std::vector<OpDesc *> AllOps() const;
......
...@@ -134,11 +134,6 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, ...@@ -134,11 +134,6 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
out_layout = out_layout =
out_layout == DataLayout::kAnyLayout ? DataLayout::kNCHW : out_layout; out_layout == DataLayout::kAnyLayout ? DataLayout::kNCHW : out_layout;
auto& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = dynamic_cast<platform::MKLDNNDeviceContext*>(
pool.Get(expected_kernel_type.place_));
auto& cpu_engine = dev_ctx->GetEngine();
std::vector<int> in_tz = paddle::framework::vectorize2int(in.dims()); std::vector<int> in_tz = paddle::framework::vectorize2int(in.dims());
std::vector<int> out_tz = in_tz; std::vector<int> out_tz = in_tz;
...@@ -147,29 +142,25 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, ...@@ -147,29 +142,25 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
"Input tensor type is not supported: %s", in.type()); "Input tensor type is not supported: %s", in.type());
memory::data_type out_type = in_type; memory::data_type out_type = in_type;
auto in_format = platform::MKLDNNFormatForSize(in_tz.size(), in.format());
auto out_format =
platform::MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout));
// output tensor has the same dims as input. Reorder don't change dims // output tensor has the same dims as input. Reorder don't change dims
out->Resize(in.dims()); out->Resize(in.dims());
if (in_format != out_format) { // tempory mem pd fr out , to make reorder
auto out_mem_pd = paddle::platform::create_prim_desc_from_dims(
paddle::framework::vectorize2int(out->dims()),
mkldnn::memory::format::blocked, out_type);
if (in.get_mkldnn_prim_desc() != out_mem_pd) {
void* in_data = GetDataFromTensor(in, in_type); void* in_data = GetDataFromTensor(in, in_type);
auto out_data = out->mutable_data(expected_kernel_type.place_, in.type()); auto out_data = out->mutable_data(expected_kernel_type.place_, in.type());
auto in_memory = auto in_memory = memory(in.get_mkldnn_prim_desc(), in_data);
memory({{{in_tz}, in_type, in_format}, cpu_engine}, in_data); auto out_memory = memory(out_mem_pd, out_data);
auto out_memory =
memory({{{out_tz}, out_type, out_format}, cpu_engine}, out_data);
platform::Reorder(in_memory, out_memory); platform::Reorder(in_memory, out_memory);
} else { } else {
out->ShareDataWith(in); out->ShareDataWith(in);
} }
out->set_layout(out_layout); out->set_layout(out_layout);
// reset format since the out tensor will be feed to non-MKLDNN OPkernel
out->set_format(memory::format::format_undef);
#endif #endif
} }
......
...@@ -51,13 +51,31 @@ void TransformData(const OpKernelType &expected_kernel_type, ...@@ -51,13 +51,31 @@ void TransformData(const OpKernelType &expected_kernel_type,
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
// Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel // Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel
// Just set layout/format. No real transform occur // Just set layout/format. No real transform occur
auto out_format = platform::MKLDNNFormatForSize(in.dims().size(),
ToMKLDNNFormat(lin));
out.ShareDataWith(input_tensor); out.ShareDataWith(input_tensor);
out.set_layout(DataLayout::kMKLDNN); // TODO(jczaja): Remove that once all mkldnn ops
out.set_format(out_format); // are modified to work with mkldnn_blocked
auto mkldnn_fmt = [&](int rank) {
switch (rank) {
case 5:
return mkldnn::memory::format::ncdhw;
case 4:
return mkldnn::memory::format::nchw;
case 3:
return mkldnn::memory::format::ncw;
case 2:
return mkldnn::memory::format::nc;
case 1:
return mkldnn::memory::format::x;
default:
return mkldnn::memory::format::blocked;
}
};
auto out_mem_pd = paddle::platform::create_prim_desc_from_dims(
paddle::framework::vectorize2int(out.dims()),
mkldnn_fmt(out.dims().size()));
out.set_mkldnn_prim_desc(out_mem_pd);
#endif #endif
} else { } else {
// Case2 - transfrom from MKLDNN OPKernel to Non-MKLDNN OPKernel // Case2 - transfrom from MKLDNN OPKernel to Non-MKLDNN OPKernel
......
...@@ -50,7 +50,12 @@ cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_ ...@@ -50,7 +50,12 @@ cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_
cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor) cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope) cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope)
cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper) if(WITH_GPU)
cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper gpu_info)
else()
cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper cpu_info)
endif()
cc_library(memory_optimize_pass SRCS memory_optimize_pass.cc DEPS memory_optimize_helper pass) cc_library(memory_optimize_pass SRCS memory_optimize_pass.cc DEPS memory_optimize_helper pass)
cc_library(inplace_op_pass SRCS inplace_op_pass.cc DEPS memory_optimize_pass op_info) cc_library(inplace_op_pass SRCS inplace_op_pass.cc DEPS memory_optimize_pass op_info)
cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper) cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper)
......
...@@ -30,8 +30,6 @@ namespace paddle { ...@@ -30,8 +30,6 @@ namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
static constexpr char kAllOpDescs[] = "all_op_descs";
VarHandle* GetValidInput(const OpHandleBase* a) { VarHandle* GetValidInput(const OpHandleBase* a) {
for (auto p : a->Inputs()) { for (auto p : a->Inputs()) {
VarHandle* b = dynamic_cast<VarHandle*>(p); VarHandle* b = dynamic_cast<VarHandle*>(p);
...@@ -52,7 +50,7 @@ std::unique_ptr<ir::Graph> AllReduceDepsPass::ApplyImpl( ...@@ -52,7 +50,7 @@ std::unique_ptr<ir::Graph> AllReduceDepsPass::ApplyImpl(
std::unordered_map<std::string, int> vars; std::unordered_map<std::string, int> vars;
// TODO(gongwb): use graph topology sort to find the order of operators. // TODO(gongwb): use graph topology sort to find the order of operators.
// Note that must assert topology sort is stable // Note that must assert topology sort is stable
auto& ops = Get<const std::vector<OpDesc*>>(kAllOpDescs); auto& ops = graph->Get<const std::vector<OpDesc*>>(kStaleProgramOpDescs);
for (auto* op_desc : ops) { for (auto* op_desc : ops) {
auto outputs = op_desc->Outputs(); auto outputs = op_desc->Outputs();
for (auto& o_it : outputs) { for (auto& o_it : outputs) {
...@@ -122,4 +120,4 @@ std::unique_ptr<ir::Graph> AllReduceDepsPass::ApplyImpl( ...@@ -122,4 +120,4 @@ std::unique_ptr<ir::Graph> AllReduceDepsPass::ApplyImpl(
REGISTER_PASS(all_reduce_deps_pass, REGISTER_PASS(all_reduce_deps_pass,
paddle::framework::details::AllReduceDepsPass) paddle::framework::details::AllReduceDepsPass)
.RequirePassAttr(paddle::framework::details::kAllOpDescs); .RequireGraphAttr(paddle::framework::details::kStaleProgramOpDescs);
...@@ -53,7 +53,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, ...@@ -53,7 +53,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
#endif #endif
void AllReduceOpHandle::RunImpl() { void AllReduceOpHandle::RunImpl() {
platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second); platform::RecordEvent record_event(Name());
WaitInputVarGenerated(); WaitInputVarGenerated();
auto in_var_handles = DynamicCast<VarHandle>(this->Inputs()); auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
......
...@@ -22,7 +22,7 @@ namespace framework { ...@@ -22,7 +22,7 @@ namespace framework {
namespace details { namespace details {
void BroadcastOpHandle::RunImpl() { void BroadcastOpHandle::RunImpl() {
platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second); platform::RecordEvent record_event(Name());
if (places_.size() == 1) return; if (places_.size() == 1) return;
...@@ -30,7 +30,7 @@ void BroadcastOpHandle::RunImpl() { ...@@ -30,7 +30,7 @@ void BroadcastOpHandle::RunImpl() {
VarHandle *in_var_handle; VarHandle *in_var_handle;
{ {
auto in_var_handles = DynamicCast<VarHandle>(inputs_); auto in_var_handles = DynamicCast<VarHandle>(inputs_);
PADDLE_ENFORCE_EQ(in_var_handles.size(), 1, PADDLE_ENFORCE_EQ(in_var_handles.size(), 1UL,
"The number of input should be one."); "The number of input should be one.");
in_var_handle = in_var_handles[0]; in_var_handle = in_var_handles[0];
} }
......
...@@ -34,9 +34,11 @@ namespace details { ...@@ -34,9 +34,11 @@ namespace details {
static inline bool SeqOnlyAllReduceOps(const BuildStrategy &strategy) { static inline bool SeqOnlyAllReduceOps(const BuildStrategy &strategy) {
// Should fix the allreduce op order if scheduling // Should fix the allreduce op order if scheduling
// them in multiple threads or processes to avoid hang. // them in multiple threads or processes to avoid hang.
// NOTE: ParallelGraph would execute this pass on each graph, so
// don't need to append it here.
return (!strategy.enable_sequential_execution_ && return (!strategy.enable_sequential_execution_ &&
strategy.num_trainers_ > 1) || strategy.num_trainers_ > 1) &&
strategy.enable_parallel_graph_; !strategy.enable_parallel_graph_;
} }
class ParallelExecutorPassBuilder : public ir::PassBuilder { class ParallelExecutorPassBuilder : public ir::PassBuilder {
...@@ -133,12 +135,15 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { ...@@ -133,12 +135,15 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
void AppendMultiDevPass(const BuildStrategy &strategy) { void AppendMultiDevPass(const BuildStrategy &strategy) {
ir::Pass *multi_devices_pass; ir::Pass *multi_devices_pass;
if (strategy_.is_distribution_) { if (strategy_.is_distribution_) {
VLOG(3) << "multi device parameter server mode";
multi_devices_pass = AppendPass("dist_multi_devices_pass").get(); multi_devices_pass = AppendPass("dist_multi_devices_pass").get();
} else { } else {
if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
VLOG(3) << "multi devices collective mode with allreduce";
multi_devices_pass = multi_devices_pass =
AppendPass("allreduce_mode_multi_devices_pass").get(); AppendPass("allreduce_mode_multi_devices_pass").get();
} else if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kReduce) { } else if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kReduce) {
VLOG(3) << "multi deivces collective mode with reduce";
multi_devices_pass = AppendPass("reduce_mode_multi_devices_pass").get(); multi_devices_pass = AppendPass("reduce_mode_multi_devices_pass").get();
} else { } else {
PADDLE_THROW("Unknown reduce strategy."); PADDLE_THROW("Unknown reduce strategy.");
...@@ -169,7 +174,8 @@ bool BuildStrategy::IsMultiDevPass(const std::string &pass_name) const { ...@@ -169,7 +174,8 @@ bool BuildStrategy::IsMultiDevPass(const std::string &pass_name) const {
} }
std::unique_ptr<ir::Graph> BuildStrategy::Apply( std::unique_ptr<ir::Graph> BuildStrategy::Apply(
const ProgramDesc &main_program, const std::vector<platform::Place> &places, std::unique_ptr<ir::Graph> graph,
const std::vector<platform::Place> &places,
const std::string &loss_var_name, const std::vector<Scope *> &local_scopes, const std::string &loss_var_name, const std::vector<Scope *> &local_scopes,
const size_t &nranks, const size_t &nranks,
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
...@@ -180,7 +186,6 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply( ...@@ -180,7 +186,6 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
// Create a default one if not finalized by user. // Create a default one if not finalized by user.
CreatePassesFromStrategy(false); CreatePassesFromStrategy(false);
std::unique_ptr<ir::Graph> graph(new ir::Graph(main_program));
for (std::shared_ptr<ir::Pass> &pass : pass_builder_->AllPasses()) { for (std::shared_ptr<ir::Pass> &pass : pass_builder_->AllPasses()) {
if (IsMultiDevPass(pass->Type())) { if (IsMultiDevPass(pass->Type())) {
pass->Erase(kPlaces); pass->Erase(kPlaces);
...@@ -198,41 +203,12 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply( ...@@ -198,41 +203,12 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
pass->Erase("nccl_ctxs"); pass->Erase("nccl_ctxs");
pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx); pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
#endif #endif
} else if (pass->Type() == "memory_optimize_pass") {
if (graph->Has(kAllOpDescs)) {
graph->Erase(kAllOpDescs);
}
const std::vector<OpDesc *> *all_op_descs =
new std::vector<OpDesc *>(main_program.Block(0).AllOps());
graph->Set<const std::vector<OpDesc *>>(kAllOpDescs,
all_op_descs); // take ownership
pass->Erase(kAllOpDescs);
pass->SetNotOwned<const std::vector<OpDesc *>>(kAllOpDescs, all_op_descs);
} else if (pass->Type() == "sequential_execution_pass") { } else if (pass->Type() == "sequential_execution_pass") {
LOG(INFO) << "set enable_sequential_execution:" LOG(INFO) << "set enable_sequential_execution:"
<< enable_sequential_execution_; << enable_sequential_execution_;
pass->Erase(kAllOpDescs);
pass->Set<const std::vector<OpDesc *>>(
kAllOpDescs,
new std::vector<OpDesc *>(main_program.Block(0).AllOps()));
} else if (pass->Type() == "all_reduce_deps_pass") { } else if (pass->Type() == "all_reduce_deps_pass") {
LOG(INFO) << "SeqOnlyAllReduceOps:" << SeqOnlyAllReduceOps(*this) LOG(INFO) << "SeqOnlyAllReduceOps:" << SeqOnlyAllReduceOps(*this)
<< ", num_trainers:" << num_trainers_; << ", num_trainers:" << num_trainers_;
pass->Erase(kAllOpDescs);
pass->Set<const std::vector<OpDesc *>>(
kAllOpDescs,
new std::vector<OpDesc *>(main_program.Block(0).AllOps()));
} else if (pass->Type() == "inplace_pass") {
if (graph->Has(kAllOpDescs)) {
graph->Erase(kAllOpDescs);
}
graph->Set<const std::vector<OpDesc *>>(
kAllOpDescs,
new std::vector<OpDesc *>(main_program.Block(0).AllOps()));
} else if (pass->Type() == "fuse_relu_depthwise_conv_pass") { } else if (pass->Type() == "fuse_relu_depthwise_conv_pass") {
if (!use_cuda) { if (!use_cuda) {
LOG(WARNING) << "fuse_relu_depthwise_conv_pass is only supported on " LOG(WARNING) << "fuse_relu_depthwise_conv_pass is only supported on "
...@@ -240,7 +216,9 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply( ...@@ -240,7 +216,9 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
continue; continue;
} }
} }
VLOG(3) << "Start Apply Pass " << pass->Type();
graph = pass->Apply(std::move(graph)); graph = pass->Apply(std::move(graph));
VLOG(3) << "Finish Apply Pass " << pass->Type();
} }
return graph; return graph;
} }
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#pragma once #pragma once
#include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
...@@ -76,11 +77,11 @@ struct BuildStrategy { ...@@ -76,11 +77,11 @@ struct BuildStrategy {
bool fuse_relu_depthwise_conv_{false}; bool fuse_relu_depthwise_conv_{false};
bool memory_optimize_{false}; bool memory_optimize_{true};
// TODO(dzhwinter): // TODO(dzhwinter):
// make enable_inplace, memory_optimize_ // make enable_inplace, memory_optimize_
// memory_early_delete_ true by default // memory_early_delete_ true by default
bool enable_inplace_{false}; bool enable_inplace_{true};
bool enable_sequential_execution_{false}; bool enable_sequential_execution_{false};
...@@ -114,7 +115,7 @@ struct BuildStrategy { ...@@ -114,7 +115,7 @@ struct BuildStrategy {
// Apply the passes built by the pass_builder_. The passes will be // Apply the passes built by the pass_builder_. The passes will be
// applied to the Program and output an ir::Graph. // applied to the Program and output an ir::Graph.
std::unique_ptr<ir::Graph> Apply(const ProgramDesc &main_program, std::unique_ptr<ir::Graph> Apply(std::unique_ptr<ir::Graph> graph,
const std::vector<platform::Place> &places, const std::vector<platform::Place> &places,
const std::string &loss_var_name, const std::string &loss_var_name,
const std::vector<Scope *> &local_scopes, const std::vector<Scope *> &local_scopes,
......
...@@ -86,7 +86,7 @@ std::vector<std::array<int, 3>> DataBalanceOpHandle::GetBalancePlan( ...@@ -86,7 +86,7 @@ std::vector<std::array<int, 3>> DataBalanceOpHandle::GetBalancePlan(
} }
void DataBalanceOpHandle::RunImpl() { void DataBalanceOpHandle::RunImpl() {
PADDLE_ENFORCE_GT(places_.size(), 1, PADDLE_ENFORCE_GT(places_.size(), 1UL,
"Data balance can only be enabled when the number of " "Data balance can only be enabled when the number of "
"places to run larger than 1."); "places to run larger than 1.");
auto in_var_handles = DynamicCast<VarHandle>(this->Inputs()); auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
......
...@@ -12,7 +12,9 @@ ...@@ -12,7 +12,9 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
#include <memory>
#include <string> #include <string>
#include <unordered_map>
#include <vector> #include <vector>
#include "paddle/fluid/framework/details/fetch_op_handle.h" #include "paddle/fluid/framework/details/fetch_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/multi_devices_helper.h"
...@@ -24,12 +26,11 @@ namespace details { ...@@ -24,12 +26,11 @@ namespace details {
FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor( FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor(
const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes, const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places, const std::vector<platform::Place> &places, ir::Graph *graph)
std::unique_ptr<ir::Graph> &&graph)
: strategy_(strategy), : strategy_(strategy),
local_scopes_(local_scopes), local_scopes_(local_scopes),
places_(places), places_(places),
graph_(std::move(graph)), graph_(graph),
pool_(strategy.num_threads_), pool_(strategy.num_threads_),
prepare_pool_(1), // add one more thread for generate op_deps prepare_pool_(1), // add one more thread for generate op_deps
fetch_ctxs_(places) { fetch_ctxs_(places) {
...@@ -56,7 +57,7 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run( ...@@ -56,7 +57,7 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
std::vector<FetchOpHandle *> fetch_ops; std::vector<FetchOpHandle *> fetch_ops;
for (auto &fetch_var_name : fetch_tensors) { for (auto &fetch_var_name : fetch_tensors) {
for (auto &var_map : graph_->Get<details::GraphVars>("vars")) { for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) {
auto it = var_map.find(fetch_var_name); auto it = var_map.find(fetch_var_name);
if (it != var_map.end()) { if (it != var_map.end()) {
fetched_vars[fetch_var_name].push_back(*it->second.rbegin()); fetched_vars[fetch_var_name].push_back(*it->second.rbegin());
...@@ -110,14 +111,14 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run( ...@@ -110,14 +111,14 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
} }
} }
if (exception_.IsCaught()) { if (exception_.IsCaught()) {
ClearFetchOp(graph_.get(), &fetch_ops); ClearFetchOp(graph_, &fetch_ops);
exception_.ReThrow(); exception_.ReThrow();
} }
} }
num_complete += num_comp; num_complete += num_comp;
} }
// Wait FetchOps. // Wait FetchOps.
ClearFetchOp(graph_.get(), &fetch_ops); ClearFetchOp(graph_, &fetch_ops);
return fetches; return fetches;
} }
......
...@@ -32,7 +32,7 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor { ...@@ -32,7 +32,7 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor {
FastThreadedSSAGraphExecutor(const ExecutionStrategy &strategy, FastThreadedSSAGraphExecutor(const ExecutionStrategy &strategy,
const std::vector<Scope *> &local_scopes, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places, const std::vector<platform::Place> &places,
std::unique_ptr<ir::Graph> &&graph); ir::Graph *graph);
FeedFetchList Run(const std::vector<std::string> &fetch_tensors) override; FeedFetchList Run(const std::vector<std::string> &fetch_tensors) override;
const ir::Graph &Graph() const override; const ir::Graph &Graph() const override;
...@@ -40,7 +40,7 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor { ...@@ -40,7 +40,7 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor {
ExecutionStrategy strategy_; ExecutionStrategy strategy_;
std::vector<Scope *> local_scopes_; std::vector<Scope *> local_scopes_;
std::vector<platform::Place> places_; std::vector<platform::Place> places_;
std::unique_ptr<ir::Graph> graph_; ir::Graph *graph_;
std::unordered_map<OpHandleBase *, int> op_deps_; std::unordered_map<OpHandleBase *, int> op_deps_;
std::vector<OpHandleBase *> bootstrap_ops_; std::vector<OpHandleBase *> bootstrap_ops_;
......
...@@ -23,7 +23,7 @@ void FuseVarsOpHandle::RunImpl() { ...@@ -23,7 +23,7 @@ void FuseVarsOpHandle::RunImpl() {
auto in_var_handles = DynamicCast<VarHandle>(this->Inputs()); auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
auto out_var_handles = DynamicCast<VarHandle>(this->Outputs()); auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
PADDLE_ENFORCE_EQ(in_var_handles.size(), 0); PADDLE_ENFORCE_EQ(in_var_handles.size(), 0UL);
PADDLE_ENFORCE_EQ(out_var_handles.size() - 1, inputs_numel_.size(), ""); PADDLE_ENFORCE_EQ(out_var_handles.size() - 1, inputs_numel_.size(), "");
auto scope = local_scope_->FindVar(kLocalExecScopeName)->Get<Scope *>(); auto scope = local_scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
......
...@@ -22,7 +22,7 @@ namespace framework { ...@@ -22,7 +22,7 @@ namespace framework {
namespace details { namespace details {
void FusedBroadcastOpHandle::RunImpl() { void FusedBroadcastOpHandle::RunImpl() {
platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second); platform::RecordEvent record_event(Name());
if (places_.size() == 1UL) return; if (places_.size() == 1UL) return;
......
...@@ -34,8 +34,8 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle { ...@@ -34,8 +34,8 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
->Var(details::kLocalExecScopeName) ->Var(details::kLocalExecScopeName)
->GetMutable<Scope*>() = &local_scope; ->GetMutable<Scope*>() = &local_scope;
for (size_t j = 0; j < input_scope_idxes.size(); ++j) { for (size_t j = 0; j < input_scope_idxes.size(); ++j) {
local_scope.Var("out_var" + j); local_scope.Var("out_var" + std::to_string(j));
if (i == j) local_scope.Var("in_var" + j); if (i == j) local_scope.Var("in_var" + std::to_string(j));
} }
param_scopes_.emplace_back(&local_scope); param_scopes_.emplace_back(&local_scope);
} }
...@@ -62,20 +62,21 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle { ...@@ -62,20 +62,21 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
for (size_t i = 0; i < input_scope_idxes.size(); ++i) { for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
// add input var handle // add input var handle
nodes_.emplace_back( nodes_.emplace_back(ir::CreateNodeForTest("in_node" + std::to_string(i),
ir::CreateNodeForTest("in_node" + i, ir::Node::Type::kVariable)); ir::Node::Type::kVariable));
VarHandle* in_var_handle = VarHandle* in_var_handle = new VarHandle(
new VarHandle(nodes_.back().get(), 1, input_scope_idxes[i], nodes_.back().get(), 1, input_scope_idxes[i],
"in_var" + i, place_list_[input_scope_idxes[i]]); "in_var" + std::to_string(i), place_list_[input_scope_idxes[i]]);
vars_.emplace_back(in_var_handle); vars_.emplace_back(in_var_handle);
op_handle_->AddInput(in_var_handle); op_handle_->AddInput(in_var_handle);
// add output var handle // add output var handle
for (size_t j = 0; j < place_list_.size(); ++j) { for (size_t j = 0; j < place_list_.size(); ++j) {
nodes_.emplace_back( nodes_.emplace_back(ir::CreateNodeForTest(
ir::CreateNodeForTest("out_node" + i, ir::Node::Type::kVariable)); "out_node" + std::to_string(i), ir::Node::Type::kVariable));
VarHandle* out_var_handle = new VarHandle( VarHandle* out_var_handle =
nodes_.back().get(), 2, j, "out_var" + i, place_list_[j]); new VarHandle(nodes_.back().get(), 2, j,
"out_var" + std::to_string(i), place_list_[j]);
vars_.emplace_back(out_var_handle); vars_.emplace_back(out_var_handle);
op_handle_->AddOutput(out_var_handle); op_handle_->AddOutput(out_var_handle);
} }
...@@ -86,7 +87,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle { ...@@ -86,7 +87,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
std::vector<std::vector<float>> send_vec; std::vector<std::vector<float>> send_vec;
f::LoD lod{{0, 10, 20}}; f::LoD lod{{0, 10, 20}};
for (size_t i = 0; i < input_scope_idxes.size(); ++i) { for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
const std::string varname("in_var" + i); const std::string varname("in_var" + std::to_string(i));
float val_scalar = static_cast<float>(i); float val_scalar = static_cast<float>(i);
send_vec.push_back( send_vec.push_back(
InitLoDTensor(varname, input_scope_idxes[i], lod, val_scalar)); InitLoDTensor(varname, input_scope_idxes[i], lod, val_scalar));
...@@ -96,7 +97,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle { ...@@ -96,7 +97,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
WaitAll(); WaitAll();
for (size_t i = 0; i < input_scope_idxes.size(); ++i) { for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
const std::string& varname("out_var" + i); const std::string& varname("out_var" + std::to_string(i));
for (size_t j = 0; j < place_list_.size(); ++j) { for (size_t j = 0; j < place_list_.size(); ++j) {
LoDTensorEqual(varname, send_vec[i], lod, param_scopes_[j]); LoDTensorEqual(varname, send_vec[i], lod, param_scopes_[j]);
} }
...@@ -109,7 +110,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle { ...@@ -109,7 +110,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
2, 4, 6, 3, 1, 1, 1, 1, 3, 7}; 2, 4, 6, 3, 1, 1, 1, 1, 3, 7};
int height = static_cast<int>(kDims[0] * 2); int height = static_cast<int>(kDims[0] * 2);
for (size_t i = 0; i < input_scope_idxes.size(); ++i) { for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
const std::string varname("in_var" + i); const std::string varname("in_var" + std::to_string(i));
float val_scalar = static_cast<float>(i); float val_scalar = static_cast<float>(i);
send_vector.push_back(InitSelectedRows(varname, input_scope_idxes[i], send_vector.push_back(InitSelectedRows(varname, input_scope_idxes[i],
rows, height, val_scalar)); rows, height, val_scalar));
...@@ -119,7 +120,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle { ...@@ -119,7 +120,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
WaitAll(); WaitAll();
for (size_t i = 0; i < input_scope_idxes.size(); ++i) { for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
const std::string& varname("out_var" + i); const std::string& varname("out_var" + std::to_string(i));
for (size_t j = 0; j < place_list_.size(); ++j) { for (size_t j = 0; j < place_list_.size(); ++j) {
SelectedRowsEqual(varname, input_scope_idxes[i], send_vector[i], rows, SelectedRowsEqual(varname, input_scope_idxes[i], send_vector[i], rows,
height); height);
......
...@@ -49,7 +49,7 @@ DEFINE_bool( ...@@ -49,7 +49,7 @@ DEFINE_bool(
"If this option turns on, only these op in whitelist can be inplaced." "If this option turns on, only these op in whitelist can be inplaced."
"If it turns off, all of the running op can be candidate of inplaced op." "If it turns off, all of the running op can be candidate of inplaced op."
"Such as scale, elementwise_add" "Such as scale, elementwise_add"
"By default, it's turned on"); "By default, it's turned off");
DECLARE_string(memory_optimize_debug); DECLARE_string(memory_optimize_debug);
......
...@@ -13,13 +13,22 @@ ...@@ -13,13 +13,22 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/memory_optimize_helper.h" #include "paddle/fluid/framework/details/memory_optimize_helper.h"
#include <algorithm>
#include <deque> #include <deque>
#include <functional> #include <functional>
#include <iostream> #include <iterator>
#include <numeric> #include <numeric>
#include <sstream> #include <sstream>
#include <string> #include <string>
#include <unordered_map>
#include <unordered_set>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/platform/cpu_info.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/gpu_info.h"
#endif // PADDLE_WITH_CUDA
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -27,10 +36,10 @@ namespace details { ...@@ -27,10 +36,10 @@ namespace details {
using paddle::framework::VarDesc; using paddle::framework::VarDesc;
std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph) { std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph) {
PADDLE_ENFORCE(graph.Has(kAllOpDescs), PADDLE_ENFORCE(graph.Has(kStaleProgramOpDescs),
"Graph has no attribute of kAllOpDescs."); "Graph has no attribute of kStaleProgramOpDescs.");
// 1. get op desc order // 1. get op desc order
auto& op_descs = graph.Get<const std::vector<OpDesc*>>(kAllOpDescs); auto& op_descs = graph.Get<const std::vector<OpDesc*>>(kStaleProgramOpDescs);
// 2. topology sort order // 2. topology sort order
auto nodes = graph.Nodes(); auto nodes = graph.Nodes();
...@@ -123,7 +132,13 @@ size_t NodeSize(const VarDesc& node) { ...@@ -123,7 +132,13 @@ size_t NodeSize(const VarDesc& node) {
} }
size_t NodeSize(ir::Node* n) { size_t NodeSize(ir::Node* n) {
auto* desc = FindVarDescInBlock(n); VarDesc* desc = nullptr;
// some op do not have block pointer
if (n->inputs[0]->Op() != nullptr) {
desc = FindVarDescInBlock(n);
} else {
desc = n->Var();
}
return NodeSize(*desc); return NodeSize(*desc);
} }
...@@ -166,6 +181,11 @@ struct NodeComparator { ...@@ -166,6 +181,11 @@ struct NodeComparator {
bool operator()(ir::Node* lhs, ir::Node* rhs) const { bool operator()(ir::Node* lhs, ir::Node* rhs) const {
auto* lhs_desc = FindVarDescInBlock(lhs); auto* lhs_desc = FindVarDescInBlock(lhs);
auto* rhs_desc = FindVarDescInBlock(rhs); auto* rhs_desc = FindVarDescInBlock(rhs);
// match data type
if (lhs_desc->GetDataType() != rhs_desc->GetDataType()) {
return false;
}
// match shape
auto lhs_shape = lhs_desc->GetShape(); auto lhs_shape = lhs_desc->GetShape();
auto rhs_shape = rhs_desc->GetShape(); auto rhs_shape = rhs_desc->GetShape();
if ((lhs_shape[0] == -1 && rhs_shape[0] == -1) || if ((lhs_shape[0] == -1 && rhs_shape[0] == -1) ||
...@@ -230,6 +250,27 @@ ir::Node* OrderedSet::FindBestFitNode(ir::Node* var) const { ...@@ -230,6 +250,27 @@ ir::Node* OrderedSet::FindBestFitNode(ir::Node* var) const {
return found_node; return found_node;
} }
ir::Node* OrderedSet::FindNextBestFitNode(ir::Node* var, ir::Node* prev) const {
ir::Node* found_node = nullptr;
NodeComparator functor;
auto it =
std::find_if(nodes_.begin(), nodes_.end(), [&](const NodeVector& v) {
if (v.front() == prev)
return true;
else
return false;
});
PADDLE_ENFORCE(it != nodes_.end(), "Not found previous in node list!");
for (it = std::next(it); it != nodes_.end(); ++it) {
auto& candidate = it->front();
if (functor(var, candidate)) {
found_node = candidate;
break;
}
}
return found_node;
}
bool OrderedSet::Has(ir::Node* var) const { bool OrderedSet::Has(ir::Node* var) const {
if (mark_table_.count(var->Name())) { if (mark_table_.count(var->Name())) {
auto& node_in_samename = mark_table_.at(var->Name()); auto& node_in_samename = mark_table_.at(var->Name());
...@@ -241,10 +282,15 @@ bool OrderedSet::Has(ir::Node* var) const { ...@@ -241,10 +282,15 @@ bool OrderedSet::Has(ir::Node* var) const {
return false; return false;
} }
void OrderedSet::Erase(const std::string& var) {
PADDLE_ENFORCE(mark_table_.count(var));
nodes_.erase(mark_table_[var]);
mark_table_.erase(var);
}
void OrderedSet::Erase(ir::Node* var) { void OrderedSet::Erase(ir::Node* var) {
PADDLE_ENFORCE(mark_table_.count(var->Name())); PADDLE_ENFORCE(var != nullptr);
nodes_.erase(mark_table_[var->Name()]); Erase(var->Name());
mark_table_.erase(var->Name());
} }
std::string OrderedSet::ToString() const { std::string OrderedSet::ToString() const {
...@@ -259,7 +305,10 @@ std::string OrderedSet::ToString() const { ...@@ -259,7 +305,10 @@ std::string OrderedSet::ToString() const {
bool NodeCanReused(ir::Node* node) { bool NodeCanReused(ir::Node* node) {
// valid the node is a var node // valid the node is a var node
if (node == nullptr || !node->IsVar() || node->IsCtrlVar()) return false; // vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad
if (node == nullptr || !node->IsVar() || node->IsCtrlVar() ||
node->Name() == kEmptyVarName)
return false;
bool flag = true; bool flag = true;
// op output force generated in cpu, can not be reused. // op output force generated in cpu, can not be reused.
...@@ -274,20 +323,37 @@ bool NodeCanReused(ir::Node* node) { ...@@ -274,20 +323,37 @@ bool NodeCanReused(ir::Node* node) {
return flag; return flag;
} }
int MinChunkSize() {
int size{0};
#ifdef PADDLE_WITH_CUDA
size = platform::GpuMinChunkSize();
#else
size = platform::CpuMinChunkSize();
#endif // PADDLE_WITH_CUDA
return size;
}
bool NodeCanReused(const VarDesc& node) { bool NodeCanReused(const VarDesc& node) {
auto type = node.GetType(); auto type = node.GetType();
// only these types holds bulk of gpu memory
if (!(type == proto::VarType::LOD_TENSOR || if (!(type == proto::VarType::LOD_TENSOR ||
type == proto::VarType::SELECTED_ROWS || type == proto::VarType::SELECTED_ROWS ||
type == proto::VarType::LOD_TENSOR_ARRAY)) { type == proto::VarType::LOD_TENSOR_ARRAY)) {
return false; return false;
} }
if (node.Persistable() || node.GetShape().empty()) { // persistable variable is parameter
if (node.Persistable()) {
return false; return false;
} }
// vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad // shape < min_chunk_size is meaningless.
std::string name = node.Name(); // further more, fetched loss always has size = 1
if (!name.empty() && name[0] == '@' && name[name.size() - 1] == '@') // which should not be reused.
auto shape = node.GetShape();
int size = std::abs(
std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>()));
if (shape.empty() || size < MinChunkSize()) {
return false; return false;
}
return true; return true;
} }
...@@ -397,11 +463,21 @@ void ControlFlowGraph::LiveVariableAnalysis() { ...@@ -397,11 +463,21 @@ void ControlFlowGraph::LiveVariableAnalysis() {
} }
} }
} }
for (auto* op : ops_) {
unlived_vars_[op] = std::set<std::string>();
for (auto& var : this->LiveIn(op)) {
if (!this->LiveOut(op).count(var)) {
unlived_vars_[op].insert(var);
}
}
}
} }
void ControlFlowGraph::RenameVarInCFGGraph(const std::string& old_node, void ControlFlowGraph::RenameVarInCFGGraph(const std::string& old_node,
const std::string& new_node, const std::string& new_node,
int begin_idx) { int begin_idx) {
std::vector<bool> need_update(ops_.size(), false);
// update graph from begin idx to the end // update graph from begin idx to the end
for (size_t i = begin_idx; i != ops_.size(); ++i) { for (size_t i = begin_idx; i != ops_.size(); ++i) {
auto* op = ops_[i]; auto* op = ops_[i];
...@@ -416,15 +492,27 @@ void ControlFlowGraph::RenameVarInCFGGraph(const std::string& old_node, ...@@ -416,15 +492,27 @@ void ControlFlowGraph::RenameVarInCFGGraph(const std::string& old_node,
if (live_in_[op].find(old_node) != live_in_[op].end()) { if (live_in_[op].find(old_node) != live_in_[op].end()) {
live_in_[op].erase(old_node); live_in_[op].erase(old_node);
live_in_[op].insert(new_node); live_in_[op].insert(new_node);
need_update[i] = true;
} }
if (live_out_[op].find(old_node) != live_out_[op].end()) { if (live_out_[op].find(old_node) != live_out_[op].end()) {
live_out_[op].erase(old_node); live_out_[op].erase(old_node);
live_out_[op].insert(new_node); live_out_[op].insert(new_node);
need_update[i] = true;
}
}
for (size_t i = begin_idx; i < ops_.size(); ++i) {
if (!need_update[i]) continue;
auto* op = ops_[i];
for (auto& var : this->LiveIn(op)) {
if (!this->LiveOut(op).count(var)) {
unlived_vars_[op].insert(var);
}
} }
} }
} }
const std::set<std::string> ControlFlowGraph::LiveIn(ir::Node* op) const { const std::set<std::string>& ControlFlowGraph::LiveIn(ir::Node* op) const {
auto it = live_in_.find(op); auto it = live_in_.find(op);
PADDLE_ENFORCE( PADDLE_ENFORCE(
it != live_in_.end(), it != live_in_.end(),
...@@ -432,7 +520,7 @@ const std::set<std::string> ControlFlowGraph::LiveIn(ir::Node* op) const { ...@@ -432,7 +520,7 @@ const std::set<std::string> ControlFlowGraph::LiveIn(ir::Node* op) const {
return it->second; return it->second;
} }
const std::set<std::string> ControlFlowGraph::LiveOut(ir::Node* op) const { const std::set<std::string>& ControlFlowGraph::LiveOut(ir::Node* op) const {
auto it = live_out_.find(op); auto it = live_out_.find(op);
PADDLE_ENFORCE( PADDLE_ENFORCE(
it != live_out_.end(), it != live_out_.end(),
...@@ -440,15 +528,24 @@ const std::set<std::string> ControlFlowGraph::LiveOut(ir::Node* op) const { ...@@ -440,15 +528,24 @@ const std::set<std::string> ControlFlowGraph::LiveOut(ir::Node* op) const {
return it->second; return it->second;
} }
const std::set<std::string> ControlFlowGraph::Use(ir::Node* op) const { const std::set<std::string>& ControlFlowGraph::Use(ir::Node* op) const {
auto it = uses_.find(op); auto it = uses_.find(op);
PADDLE_ENFORCE( PADDLE_ENFORCE(
it != uses_.end(), it != uses_.end(),
string::Sprintf("Expect %s in live_out, but Not Found.", op->Name())); string::Sprintf("Expect %s in use, but Not Found.", op->Name()));
return it->second;
}
const std::set<std::string>& ControlFlowGraph::Unlived(ir::Node* op) const {
auto it = unlived_vars_.find(op);
PADDLE_ENFORCE(
it != unlived_vars_.end(),
string::Sprintf("Expect %s in unlived_set, but Not Found.", op->Name()));
return it->second;
return it->second; return it->second;
} }
const std::vector<ir::Node*> ControlFlowGraph::Ops() const { return ops_; } const std::vector<ir::Node*>& ControlFlowGraph::Ops() const { return ops_; }
std::vector<ir::Node*>& ControlFlowGraph::Ops() { return ops_; } std::vector<ir::Node*>& ControlFlowGraph::Ops() { return ops_; }
...@@ -461,7 +558,9 @@ ir::Node* ControlFlowGraph::GetNodeByName(const std::string& name, ...@@ -461,7 +558,9 @@ ir::Node* ControlFlowGraph::GetNodeByName(const std::string& name,
for (auto* node : ops_) { for (auto* node : ops_) {
if (node == op) break; if (node == op) break;
for (auto& output : node->outputs) { for (auto& output : node->outputs) {
if (output->Name() == name) { PADDLE_ENFORCE((output != nullptr && output->IsVar()),
"Output is empty!");
if (output->Var() && output->Name() == name) {
found_node = output; found_node = output;
} }
} }
......
...@@ -29,8 +29,6 @@ namespace paddle { ...@@ -29,8 +29,6 @@ namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
constexpr char kAllOpDescs[] = "all_op_descs";
std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph); std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph);
// NOTE(dzh): A ordered set for node reuse in memory optimize. // NOTE(dzh): A ordered set for node reuse in memory optimize.
...@@ -55,6 +53,7 @@ class OrderedSet { ...@@ -55,6 +53,7 @@ class OrderedSet {
void Insert(ir::Node* var); void Insert(ir::Node* var);
void Erase(ir::Node* var); void Erase(ir::Node* var);
void Erase(const std::string& var);
bool Has(ir::Node* var) const; bool Has(ir::Node* var) const;
void Clear() { void Clear() {
mark_table_.clear(); mark_table_.clear();
...@@ -62,6 +61,7 @@ class OrderedSet { ...@@ -62,6 +61,7 @@ class OrderedSet {
} }
// find the bestfit shape node block with var. // find the bestfit shape node block with var.
ir::Node* FindBestFitNode(ir::Node* var) const; ir::Node* FindBestFitNode(ir::Node* var) const;
ir::Node* FindNextBestFitNode(ir::Node* var, ir::Node* prev) const;
// map store non-const iterator, can not promise const // map store non-const iterator, can not promise const
int GetNodeIndexInPool(ir::Node* var); int GetNodeIndexInPool(ir::Node* var);
// pool all node to string // pool all node to string
...@@ -92,10 +92,11 @@ class ControlFlowGraph { ...@@ -92,10 +92,11 @@ class ControlFlowGraph {
void RenameVarInCFGGraph(const std::string& old_node, void RenameVarInCFGGraph(const std::string& old_node,
const std::string& new_node, int begin_idx); const std::string& new_node, int begin_idx);
const std::set<std::string> LiveIn(ir::Node* op) const; const std::set<std::string>& LiveIn(ir::Node* op) const;
const std::set<std::string> LiveOut(ir::Node* op) const; const std::set<std::string>& LiveOut(ir::Node* op) const;
const std::set<std::string> Use(ir::Node* op) const; const std::set<std::string>& Use(ir::Node* op) const;
const std::vector<ir::Node*> Ops() const; const std::set<std::string>& Unlived(ir::Node* op) const;
const std::vector<ir::Node*>& Ops() const;
std::vector<ir::Node*>& Ops(); std::vector<ir::Node*>& Ops();
// for ssa-graph nodes // for ssa-graph nodes
...@@ -117,6 +118,7 @@ class ControlFlowGraph { ...@@ -117,6 +118,7 @@ class ControlFlowGraph {
VarSetMap live_out_; VarSetMap live_out_;
VarSetMap uses_; // op inputs VarSetMap uses_; // op inputs
VarSetMap defs_; // op outputs VarSetMap defs_; // op outputs
std::unordered_map<ir::Node*, std::set<std::string>> unlived_vars_;
std::vector<ir::Node*> ops_; // op sequence by topology sort std::vector<ir::Node*> ops_; // op sequence by topology sort
}; };
......
...@@ -107,6 +107,52 @@ TEST(OrderedSet, Normal) { ...@@ -107,6 +107,52 @@ TEST(OrderedSet, Normal) {
ASSERT_EQ(pool.GetNodeIndexInPool(cache), 5); // match 4:[5,2] ASSERT_EQ(pool.GetNodeIndexInPool(cache), 5); // match 4:[5,2]
} }
} }
TEST(OrderedSet, FindBestFitNode) {
OrderedSet pool;
std::vector<std::unique_ptr<ir::Node>> nodes;
ProgramDesc prog;
BlockDesc* block_desc = prog.MutableBlock(0);
auto* op_desc = block_desc->AppendOp();
op_desc->SetType("dummy");
std::unique_ptr<ir::Node> op = ir::CreateNodeForTest(op_desc);
{
auto desc = block_desc->Var("a");
desc->SetShape({128, 128});
std::unique_ptr<ir::Node> node = ir::CreateNodeForTest(desc);
node->inputs.emplace_back(op.get());
nodes.emplace_back(std::move(node));
}
{
auto desc = block_desc->Var("b");
desc->SetShape({128, 129});
std::unique_ptr<ir::Node> node = ir::CreateNodeForTest(desc);
node->inputs.emplace_back(op.get());
nodes.emplace_back(std::move(node));
}
{
auto desc = block_desc->Var("c");
desc->SetShape({128, 128});
std::unique_ptr<ir::Node> node = ir::CreateNodeForTest(desc);
node->inputs.emplace_back(op.get());
nodes.emplace_back(std::move(node));
}
for (auto& node : nodes) {
pool.Insert(node.get());
}
// FindNextBestFitNode
auto* n = nodes[0].get();
auto* cache = pool.FindBestFitNode(n);
PADDLE_ENFORCE(cache->Name() == "a");
cache = pool.FindNextBestFitNode(n, cache);
PADDLE_ENFORCE(cache->Name() == "c");
cache = pool.FindNextBestFitNode(n, cache);
PADDLE_ENFORCE(cache->Name() == "b");
}
} // namespace details } // namespace details
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -182,9 +228,6 @@ TEST(CFGGraph, IRGraph) { ...@@ -182,9 +228,6 @@ TEST(CFGGraph, IRGraph) {
// prepare ir graph // prepare ir graph
auto prog = FillProgramDesc(); auto prog = FillProgramDesc();
ir::Graph graph(prog); ir::Graph graph(prog);
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
ControlFlowGraph cfg(graph); ControlFlowGraph cfg(graph);
cfg.LiveVariableAnalysis(); cfg.LiveVariableAnalysis();
...@@ -210,9 +253,6 @@ TEST(CFGGraph, IRGraph) { ...@@ -210,9 +253,6 @@ TEST(CFGGraph, IRGraph) {
TEST(SortOpLikeDescOrder, NormalTest) { TEST(SortOpLikeDescOrder, NormalTest) {
auto prog = FillProgramDesc(); auto prog = FillProgramDesc();
ir::Graph graph(prog); ir::Graph graph(prog);
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
auto nodes = SortOpLikeDescOrder(graph); auto nodes = SortOpLikeDescOrder(graph);
auto op_descs = prog.Block(0).AllOps(); auto op_descs = prog.Block(0).AllOps();
...@@ -227,9 +267,6 @@ TEST(SortOpLikeDescOrder, NormalTest) { ...@@ -227,9 +267,6 @@ TEST(SortOpLikeDescOrder, NormalTest) {
TEST(SortOpLikeDescOrder, RemoveOpDesc) { TEST(SortOpLikeDescOrder, RemoveOpDesc) {
auto prog = FillProgramDesc(); auto prog = FillProgramDesc();
ir::Graph graph(prog); ir::Graph graph(prog);
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
auto nodes = graph.Nodes(); auto nodes = graph.Nodes();
auto op_descs = prog.Block(0).AllOps(); auto op_descs = prog.Block(0).AllOps();
ir::Node* found_node = nullptr; ir::Node* found_node = nullptr;
...@@ -278,8 +315,6 @@ TEST(SortOpLikeDescOrder, RemoveOpDesc) { ...@@ -278,8 +315,6 @@ TEST(SortOpLikeDescOrder, RemoveOpDesc) {
// 3. add some op_desc // 3. add some op_desc
TEST(SortOpLikeDescOrder, AddOpDesc) { TEST(SortOpLikeDescOrder, AddOpDesc) {
auto prog = FillProgramDesc(); auto prog = FillProgramDesc();
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
ir::Graph graph(prog); ir::Graph graph(prog);
auto find_node_in_graph = [&](std::string s) { auto find_node_in_graph = [&](std::string s) {
...@@ -296,9 +331,7 @@ TEST(SortOpLikeDescOrder, AddOpDesc) { ...@@ -296,9 +331,7 @@ TEST(SortOpLikeDescOrder, AddOpDesc) {
// cached desc different with real one // cached desc different with real one
// mimic the intermidiete pass modify the programdesc. // mimic the intermidiete pass modify the programdesc.
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership std::vector<OpDesc*> op_descs = graph.OriginProgram().Block(0).AllOps();
auto op_descs = prog.Block(0).AllOps();
auto op = prog.MutableBlock(0)->AppendOp(); auto op = prog.MutableBlock(0)->AppendOp();
prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR); prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
...@@ -330,9 +363,6 @@ TEST(SortOpLikeDescOrder, AddOpDesc) { ...@@ -330,9 +363,6 @@ TEST(SortOpLikeDescOrder, AddOpDesc) {
TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) { TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) {
auto prog = FillProgramDesc(); auto prog = FillProgramDesc();
ir::Graph graph(prog); ir::Graph graph(prog);
const std::vector<OpDesc*>* all_op_descs =
new std::vector<OpDesc*>(prog.Block(0).AllOps());
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
auto find_node_in_graph = [&](std::string s) { auto find_node_in_graph = [&](std::string s) {
ir::Node* ret = nullptr; ir::Node* ret = nullptr;
...@@ -346,8 +376,9 @@ TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) { ...@@ -346,8 +376,9 @@ TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) {
return ret; return ret;
}; };
std::vector<OpDesc*> op_descs = graph.OriginProgram().Block(0).AllOps();
// remove sum node // remove sum node
auto op_descs = prog.Block(0).AllOps();
ir::Node* found_node = nullptr; ir::Node* found_node = nullptr;
auto nodes = graph.Nodes(); auto nodes = graph.Nodes();
for (auto node : nodes) { for (auto node : nodes) {
...@@ -408,9 +439,7 @@ TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) { ...@@ -408,9 +439,7 @@ TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) {
TEST(SortOpLikeDescOrder, AddAndReplaceOpDescInplace) { TEST(SortOpLikeDescOrder, AddAndReplaceOpDescInplace) {
auto prog = FillProgramDesc(); auto prog = FillProgramDesc();
ir::Graph graph(prog); ir::Graph graph(prog);
const std::vector<OpDesc*>* all_op_descs = std::vector<OpDesc*> op_descs = graph.OriginProgram().Block(0).AllOps();
new std::vector<OpDesc*>(prog.Block(0).AllOps());
graph.Set(details::kAllOpDescs, all_op_descs); // take ownership
auto find_node_in_graph = [&](std::string s) { auto find_node_in_graph = [&](std::string s) {
ir::Node* ret = nullptr; ir::Node* ret = nullptr;
...@@ -424,7 +453,6 @@ TEST(SortOpLikeDescOrder, AddAndReplaceOpDescInplace) { ...@@ -424,7 +453,6 @@ TEST(SortOpLikeDescOrder, AddAndReplaceOpDescInplace) {
return ret; return ret;
}; };
auto op_descs = prog.Block(0).AllOps();
// add node // add node
auto op = prog.MutableBlock(0)->AppendOp(); auto op = prog.MutableBlock(0)->AppendOp();
prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR); prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
......
...@@ -69,58 +69,60 @@ std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl( ...@@ -69,58 +69,60 @@ std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
} }
for (auto& var : op->outputs) { for (auto& var : op->outputs) {
if (!NodeCanReused(var) || cfg_->Use(op).count(var->Name()) == 0 || if (var->IsVar() && !var->IsCtrlVar() && skip_set_.count(var->Name())) {
skip_set_.count(var->Name())) VLOG(3) << "Skip set contains variable of " << var->Name()
<< "disable reuse on it. skipped";
continue; continue;
ir::Node* cache = pool_.FindBestFitNode(var);
if (var->Name() == FLAGS_memory_optimize_debug) {
VLOG(3) << "start match var " << DebugString(var) << " of op "
<< op->Name();
VLOG(3) << pool_.ToString();
VLOG(3) << "matched in pool : "
<< ((cache == nullptr) ? "False" : "True");
} }
if (NodeCanReused(var) && cfg_->Use(op).count(var->Name()) == 0) {
ir::Node* cache = pool_.FindBestFitNode(var);
while (cache != nullptr && var->Name() == cache->Name()) {
VLOG(3) << "The same cache variable is cascade reused. "
<< cache->Name() << " is re-filled to the pool after "
<< "the reused op is finished. Current op can not "
<< "replace it again. Skip this candidate.";
cache = pool_.FindNextBestFitNode(var, cache);
}
if (var->Name() == FLAGS_memory_optimize_debug) {
VLOG(3) << "start match var " << DebugString(var) << " of op "
<< op->Name();
VLOG(3) << pool_.ToString();
VLOG(3) << "matched in pool : "
<< ((cache == nullptr) ? "False" : "True");
}
if (cache == nullptr) continue; if (cache != nullptr) {
if (var->Name() == cache->Name()) { int node_idx_in_pool = pool_.GetNodeIndexInPool(cache);
VLOG(3) << "The same cache variable is cascade reused." << var->Name() VLOG(3) << string::Sprintf(
<< " is re-filled to the pool after" "!!! %s, %s => %s, cache idx %d, pool size %d",
<< "the reused op is finished. Current op can not " std::to_string(reuse_id++), DebugString(var), DebugString(cache),
<< "replace it again. Skip this candidate."; node_idx_in_pool, static_cast<int>(pool_.size()));
continue; // NOTE(dzhwinter): update the ProgramDesc/IR Graph
// and the CFG Graph on the fly.
int node_idx_in_pool = pool_.GetNodeIndexInPool(cache); //
VLOG(3) << string::Sprintf( // IR Graph define the dependence relationship between nodes.
"!!! %s, %s => %s, cache idx %d, pool size %d", //
std::to_string(reuse_id++), DebugString(var), DebugString(cache), // ProgramDesc defines the input/output vars. Its used in
node_idx_in_pool, static_cast<int>(pool_.size())); // CreateOp, CreateVar when running happens.
//
// update CFG Graph on the fly. // CFG Graph store the liveness information, when reuse happens
// reused var maybe re-fill into the pool // we also need to update the variable liveness.
cfg_->RenameVarInCFGGraph(var->Name(), cache->Name(), idx); const std::string var_name = var->Name();
// NOTE(dzhwinter): we need to both update the ProgramDesc const std::string cache_name = cache->Name();
// and IR Graph. because op_desc/var_desc is used in CreateOp,
// CreateVar when running happens. But IR Graph
// define the dependence relationship between nodes.
RenameVarInGraphDesc(var->Name(), cache->Name(), idx);
RenameVarInGraphNode(var->Name(), cache->Name(), idx, graph.get());
pool_.Erase(cache);
}
// fill the pool cfg_->RenameVarInCFGGraph(var_name, cache_name, idx);
std::unordered_set<std::string> unlived_vars; RenameVarInGraphDesc(var_name, cache_name, idx);
for (auto var : cfg_->LiveIn(op)) { RenameVarInGraphNode(var_name, cache_name, idx, graph.get());
if (cfg_->LiveOut(op).count(var) == 0) { pool_.Erase(cache_name);
unlived_vars.emplace(var);
} }
} }
for (auto var : unlived_vars) { }
ir::Node* var_node = cfg_->GetNodeByName(var, op); // fill the pool
if (NodeCanReused(var_node) && !pool_.Has(var_node)) { for (auto& var : cfg_->Unlived(op)) {
pool_.Insert(var_node); ir::Node* var_node = cfg_->GetNodeByName(var, op);
} if (var_node == nullptr || var_node->IsCtrlVar()) continue;
if (NodeCanReused(var_node) && !pool_.Has(var_node)) {
pool_.Insert(var_node);
} }
} }
} }
...@@ -190,7 +192,8 @@ void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const { ...@@ -190,7 +192,8 @@ void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const {
// effect. Because it is a single op in graph. No need to // effect. Because it is a single op in graph. No need to
// update the ir nodes. // update the ir nodes.
sub_op_desc->Rename(var->Name(), cache->Name()); sub_op_desc->Rename(var->Name(), cache->Name());
if (sub_op_desc->Block()->HasVar(var->Name())) { if (sub_op_desc->Block() != nullptr &&
sub_op_desc->Block()->HasVar(var->Name())) {
sub_op_desc->Block()->RemoveVar(var->Name()); sub_op_desc->Block()->RemoveVar(var->Name());
} }
} }
...@@ -231,7 +234,13 @@ void MemoryOptimizePass::RenameVarInGraphDesc(const std::string& var, ...@@ -231,7 +234,13 @@ void MemoryOptimizePass::RenameVarInGraphDesc(const std::string& var,
auto* op_desc = op->Op(); auto* op_desc = op->Op();
op_desc->RenameInput(var, cache_var); op_desc->RenameInput(var, cache_var);
op_desc->RenameOutput(var, cache_var); op_desc->RenameOutput(var, cache_var);
if (op_desc->Block()->HasVar(var)) op_desc->Block()->RemoveVar(var); if (op_desc->Block() != nullptr) {
op_desc->Block()->RemoveVar(var);
} else {
LOG(WARNING) << "op " << op->Name() << " not know its block."
<< "Is the op_desc created without block pointer? "
<< "Can not find " << var << " in Block(0)";
}
op_desc->Flush(); op_desc->Flush();
} }
} }
...@@ -273,8 +282,7 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var, ...@@ -273,8 +282,7 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var,
// redirect the input to the latest version of cache_var // redirect the input to the latest version of cache_var
for (auto* node : op->inputs) { for (auto* node : op->inputs) {
if (node->Name() == var) { if (node->Name() == var) {
ir::Node* cache_node = graph->CreateVarNode(var_desc.get()); ir::Node* cache_node = var_nodes_[cache_var].back();
var_nodes_[cache_var].emplace_back(cache_node);
// swap node to cache_node // swap node to cache_node
cache_node->outputs.insert(cache_node->outputs.end(), cache_node->outputs.insert(cache_node->outputs.end(),
...@@ -283,11 +291,15 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var, ...@@ -283,11 +291,15 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var,
auto* prev_op = node->inputs[0]; auto* prev_op = node->inputs[0];
std::replace(prev_op->outputs.begin(), prev_op->outputs.end(), node, std::replace(prev_op->outputs.begin(), prev_op->outputs.end(), node,
cache_node); cache_node);
cache_node->inputs.emplace_back(prev_op);
for (auto* next_op : node->outputs) { for (auto* next_op : node->outputs) {
std::replace(next_op->inputs.begin(), next_op->inputs.end(), node, std::replace(next_op->inputs.begin(), next_op->inputs.end(), node,
cache_node); cache_node);
} }
// erase unused node
auto& nodes = var_nodes_.at(var);
nodes.erase(std::remove(nodes.begin(), nodes.end(), node), nodes.end());
graph->RemoveNode(node);
} }
} }
...@@ -307,15 +319,14 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var, ...@@ -307,15 +319,14 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var,
std::replace(next_op->inputs.begin(), next_op->inputs.end(), node, std::replace(next_op->inputs.begin(), next_op->inputs.end(), node,
cache_node); cache_node);
} }
// erase unused node
auto& nodes = var_nodes_.at(var);
nodes.erase(std::remove(nodes.begin(), nodes.end(), node), nodes.end());
graph->RemoveNode(node);
} }
} }
} }
// release node of unused var in graph
for (auto* node : var_nodes_[var]) {
graph->RemoveNode(node);
}
var_nodes_.at(var).clear();
} }
} // namespace details } // namespace details
...@@ -324,4 +335,4 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var, ...@@ -324,4 +335,4 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var,
REGISTER_PASS(memory_optimize_pass, REGISTER_PASS(memory_optimize_pass,
paddle::framework::details::MemoryOptimizePass) paddle::framework::details::MemoryOptimizePass)
.RequireGraphAttr(paddle::framework::details::kAllOpDescs); .RequireGraphAttr(paddle::framework::details::kStaleProgramOpDescs);
...@@ -392,20 +392,32 @@ void MultiDevSSAGraphBuilderBase::CreateComputationalOp(ir::Graph *result, ...@@ -392,20 +392,32 @@ void MultiDevSSAGraphBuilderBase::CreateComputationalOp(ir::Graph *result,
void MultiDevSSAGraphBuilderBase::CreateAllReduceOp( void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(
ir::Graph *result, const std::string &og) const { ir::Graph *result, const std::string &og) const {
OpHandleBase *op_handle = nullptr;
auto append_allreduce_op = [&](
const std::vector<Scope *> &scopes,
const std::vector<platform::Place> &places) -> OpHandleBase * {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
result->Get<GraphOps>(kGraphOps).emplace_back(new AllReduceOpHandle( result->Get<GraphOps>(kGraphOps).emplace_back(new AllReduceOpHandle(
result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
local_scopes_, places_, nccl_ctxs_)); scopes, places, nccl_ctxs_));
#else #else
result->Get<GraphOps>(kGraphOps).emplace_back(new AllReduceOpHandle( result->Get<GraphOps>(kGraphOps).emplace_back(new AllReduceOpHandle(
result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
local_scopes_, places_)); scopes, places));
#endif #endif
auto *op_handle = result->Get<GraphOps>(kGraphOps).back(); return result->Get<GraphOps>(kGraphOps).back();
};
if (!strategy_.enable_parallel_graph_)
op_handle = append_allreduce_op(local_scopes_, places_);
for (size_t i = 0; i < places_.size(); ++i) { for (size_t i = 0; i < places_.size(); ++i) {
auto &p = places_[i]; if (strategy_.enable_parallel_graph_) {
SetCommunicationContext(op_handle, p); op_handle = append_allreduce_op({local_scopes_[i]}, {places_[i]});
}
SetCommunicationContext(op_handle, places_[i]);
auto &vars = result->Get<GraphVars>(kGraphVars)[i][og]; auto &vars = result->Get<GraphVars>(kGraphVars)[i][og];
PADDLE_ENFORCE(!vars.empty()); PADDLE_ENFORCE(!vars.empty());
auto &prev_grad = vars.back(); auto &prev_grad = vars.back();
...@@ -413,7 +425,7 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp( ...@@ -413,7 +425,7 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(
auto var = auto var =
new VarHandle(result->CreateEmptyNode(og, ir::Node::Type::kVariable), new VarHandle(result->CreateEmptyNode(og, ir::Node::Type::kVariable),
vars.size(), i, og, p); vars.size(), i, og, places_[i]);
vars.emplace_back(var); vars.emplace_back(var);
op_handle->AddOutput(var); op_handle->AddOutput(var);
} }
...@@ -925,9 +937,21 @@ void DistSSAGraphBuilder::InsertCollectiveOp(ir::Graph *result, ...@@ -925,9 +937,21 @@ void DistSSAGraphBuilder::InsertCollectiveOp(ir::Graph *result,
} }
void DistSSAGraphBuilder::InsertPostprocessOps(ir::Graph *result) const { void DistSSAGraphBuilder::InsertPostprocessOps(ir::Graph *result) const {
if (need_broadcast_var_ || // broad cast received parameters when training in parameter server mode.
(UseGPU() && if (need_broadcast_var_) {
strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce)) { // There are 4 conditions:
// 1. GPU && Reduce: Reduce gradient then broadcast gradient to other GPUS.
// Need to broadcast received parameters to other GPU.
// 2. GPU && AllReduce: AllReduce all graident to each GPU. Need to
// broadcast received parameters to other GPU.
// 3. CPU && AllReduce: AllReduce all gradient to each thread. Need to
// broadcast received parameters to other scope.
// 4. CPU && Reduce: because all parameters share the same memory, did not
// broadcast received parameters.
if (!UseGPU() &&
strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) {
return;
}
if (strategy_.fuse_broadcast_op_) { if (strategy_.fuse_broadcast_op_) {
CreateFusedBroadcastOp(result, bcast_var_name_set_); CreateFusedBroadcastOp(result, bcast_var_name_set_);
} else { } else {
......
...@@ -36,13 +36,14 @@ namespace details { ...@@ -36,13 +36,14 @@ namespace details {
// map from variable name to variables. The variables, who have the same name, // map from variable name to variables. The variables, who have the same name,
// will have a differsent version. The offset in the // will have a differsent version. The offset in the
// `std::vector<VarHandle*>` is the version of varaibles. // `std::vector<VarHandle*>` is the version of varaibles.
typedef std::vector<std::unordered_map<std::string, std::vector<VarHandle*>>> typedef std::vector<std::unordered_map<std::string, std::vector<VarHandle *>>>
GraphVars; GraphVars;
const char kGraphVars[] = "vars"; const char kGraphVars[] = "vars";
// aux variables to represent dependency. Useful to resolve data hazard. // aux variables to represent dependency. Useful to resolve data hazard.
typedef std::unordered_set<VarHandleBase*> GraphDepVars; typedef std::unordered_set<VarHandleBase *> GraphDepVars;
const char kGraphDepVars[] = "dep_vars"; const char kGraphDepVars[] = "dep_vars";
} // namespace details } // namespace details
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -70,6 +70,9 @@ class OpHandleBase { ...@@ -70,6 +70,9 @@ class OpHandleBase {
auto it = dev_ctxes_.find(place); auto it = dev_ctxes_.find(place);
return it != dev_ctxes_.end() ? it->second : nullptr; return it != dev_ctxes_.end() ? it->second : nullptr;
} }
const std::map<platform::Place, platform::DeviceContext *> &DeviceContext() {
return dev_ctxes_;
}
void SetDeviceContext(platform::Place place, platform::DeviceContext *ctx_) { void SetDeviceContext(platform::Place place, platform::DeviceContext *ctx_) {
dev_ctxes_[place] = ctx_; dev_ctxes_[place] = ctx_;
......
...@@ -13,22 +13,85 @@ ...@@ -13,22 +13,85 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h" #include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
std::vector<std::unique_ptr<ir::Graph>>
ParallelSSAGraphExecutor::SeparateMultiDevicesGraph(ir::Graph *graph) {
std::vector<std::unique_ptr<ir::Graph>> graphs;
graphs.reserve(places_.size());
for (size_t i = 0; i < places_.size(); ++i) {
ProgramDesc empty;
graphs.emplace_back(std::unique_ptr<ir::Graph>(new ir::Graph(empty)));
auto &g = graphs.back();
g->Set(kGraphVars, new GraphVars(1UL));
g->Set(kGraphDepVars, new GraphDepVars);
}
auto op_handles = ir::FilterByNodeWrapper<OpHandleBase>(*graph);
for (auto &op : op_handles) {
auto &dev_ctx = op->DeviceContext();
auto &p = dev_ctx.begin()->first;
int dev_id = boost::get<platform::CUDAPlace>(p).device;
auto &dev_dummys = graphs[dev_id]->Get<GraphDepVars>(kGraphDepVars);
graphs[dev_id]->AddNode(graph->RemoveNode(op->Node()).release());
for (auto &var : op->Inputs()) {
auto dummy_ptr = dynamic_cast<DummyVarHandle *>(var);
if (dummy_ptr) {
dev_dummys.insert(var);
if (graph->Nodes().count(var->Node()))
graphs[dev_id]->AddNode(graph->RemoveNode(var->Node()).release());
}
}
for (auto &var : op->Outputs()) {
auto dummy_ptr = dynamic_cast<DummyVarHandle *>(var);
if (dummy_ptr) {
dev_dummys.insert(var);
if (graph->Nodes().count(var->Node()))
graphs[dev_id]->AddNode(graph->RemoveNode(var->Node()).release());
}
}
}
for (size_t dev_id = 0; dev_id < places_.size(); ++dev_id) {
auto &dev_vars = graphs[dev_id]->Get<GraphVars>(kGraphVars)[0];
auto &origin_vars = graph->Get<GraphVars>(kGraphVars)[dev_id];
for (auto &name_pair : origin_vars) {
dev_vars.emplace(name_pair.first, name_pair.second);
for (auto &version_pair : name_pair.second) {
if (graph->Nodes().count(version_pair->Node())) {
graphs[dev_id]->AddNode(
graph->RemoveNode(version_pair->Node()).release());
}
}
}
}
return graphs;
}
ParallelSSAGraphExecutor::ParallelSSAGraphExecutor( ParallelSSAGraphExecutor::ParallelSSAGraphExecutor(
const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes, const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places, const std::vector<platform::Place> &places, ir::Graph *graph)
std::vector<std::unique_ptr<ir::Graph>> &&graphs)
: strategy_(std::move(strategy)), : strategy_(std::move(strategy)),
local_scopes_(std::move(local_scopes)), local_scopes_(std::move(local_scopes)),
pool_(places.size() >= 2 ? new ::ThreadPool(places.size()) : nullptr), pool_(places.size() >= 2 ? new ::ThreadPool(places.size()) : nullptr),
places_(std::move(places)), places_(std::move(places)),
graphs_(std::move(graphs)) { // TODO(Yancey1989): Copying graphs is not safely since it deleted the
// attrs.
graphs_(SeparateMultiDevicesGraph(graph)) {
PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
auto seq_allreduce_pass =
ir::PassRegistry::Instance().Get("all_reduce_deps_pass");
for (size_t i = 0; i < graphs_.size(); ++i) {
graphs_[i] = seq_allreduce_pass->Apply(std::move(graphs_[i]));
}
// set the correct size of thread pool to each device. // set the correct size of thread pool to each device.
strategy_.num_threads_ = strategy_.num_threads_ < places_.size() strategy_.num_threads_ = strategy_.num_threads_ < places_.size()
? 1UL ? 1UL
...@@ -37,7 +100,7 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor( ...@@ -37,7 +100,7 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor(
<< " to run the operators of the graph on each device."; << " to run the operators of the graph on each device.";
for (size_t i = 0; i < places.size(); ++i) { for (size_t i = 0; i < places.size(); ++i) {
executors_.emplace_back(new details::ThreadedSSAGraphExecutor( executors_.emplace_back(new details::ThreadedSSAGraphExecutor(
strategy_, {local_scopes_[i]}, {places_[i]}, std::move(graphs_[i]))); strategy_, local_scopes_, {places_[i]}, graphs_.at(i).get()));
} }
} }
......
...@@ -18,7 +18,9 @@ ...@@ -18,7 +18,9 @@
#include <vector> #include <vector>
#include "ThreadPool.h" #include "ThreadPool.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
#include "paddle/fluid/framework/ir/graph.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -29,13 +31,17 @@ class ParallelSSAGraphExecutor : public SSAGraphExecutor { ...@@ -29,13 +31,17 @@ class ParallelSSAGraphExecutor : public SSAGraphExecutor {
ParallelSSAGraphExecutor(const ExecutionStrategy &strategy, ParallelSSAGraphExecutor(const ExecutionStrategy &strategy,
const std::vector<Scope *> &local_scopes, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places, const std::vector<platform::Place> &places,
std::vector<std::unique_ptr<ir::Graph>> &&graphs); ir::Graph *graph);
~ParallelSSAGraphExecutor() final = default; ~ParallelSSAGraphExecutor() final = default;
const ir::Graph &Graph() const override { return *graphs_[0]; } const ir::Graph &Graph() const override { return *graphs_[0]; }
FeedFetchList Run(const std::vector<std::string> &fetch_tensors) override; FeedFetchList Run(const std::vector<std::string> &fetch_tensors) override;
private: private:
std::vector<std::unique_ptr<ir::Graph>> SeparateMultiDevicesGraph(
ir::Graph *graph);
ExecutionStrategy strategy_; ExecutionStrategy strategy_;
std::vector<Scope *> local_scopes_; std::vector<Scope *> local_scopes_;
std::unique_ptr<::ThreadPool> pool_{nullptr}; std::unique_ptr<::ThreadPool> pool_{nullptr};
......
...@@ -139,7 +139,7 @@ void ReduceOpHandle::GatherSelectedRows( ...@@ -139,7 +139,7 @@ void ReduceOpHandle::GatherSelectedRows(
#endif #endif
void ReduceOpHandle::RunImpl() { void ReduceOpHandle::RunImpl() {
platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second); platform::RecordEvent record_event(Name());
if (places_.size() == 1) return; if (places_.size() == 1) return;
// the input and output may have dummy var. // the input and output may have dummy var.
...@@ -153,7 +153,7 @@ void ReduceOpHandle::RunImpl() { ...@@ -153,7 +153,7 @@ void ReduceOpHandle::RunImpl() {
{ {
auto out_var_handles = DynamicCast<VarHandle>(outputs_); auto out_var_handles = DynamicCast<VarHandle>(outputs_);
PADDLE_ENFORCE_EQ(out_var_handles.size(), 1, PADDLE_ENFORCE_EQ(out_var_handles.size(), 1UL,
"The number of output should be one."); "The number of output should be one.");
out_var_handle = out_var_handles.front(); out_var_handle = out_var_handles.front();
} }
......
...@@ -63,7 +63,7 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run( ...@@ -63,7 +63,7 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
eptr = std::current_exception(); eptr = std::current_exception();
} }
platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr); platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun");
++drop_scope_counter_; ++drop_scope_counter_;
bool stream_end = false; bool stream_end = false;
......
...@@ -40,7 +40,7 @@ std::unique_ptr<ir::Graph> SequentialExecutionPass::ApplyImpl( ...@@ -40,7 +40,7 @@ std::unique_ptr<ir::Graph> SequentialExecutionPass::ApplyImpl(
static std::unordered_set<std::string> skip_dist_ops{ static std::unordered_set<std::string> skip_dist_ops{
"send", "recv", "send_barrier", "fetch_barrier"}; "send", "recv", "send_barrier", "fetch_barrier"};
auto &ops = Get<const std::vector<OpDesc *>>(kAllOpDescs); auto &ops = graph->Get<const std::vector<OpDesc *>>(kStaleProgramOpDescs);
std::vector<ir::Node *> op_node_list; std::vector<ir::Node *> op_node_list;
op_node_list.reserve(ops.size()); op_node_list.reserve(ops.size());
...@@ -107,4 +107,4 @@ std::unique_ptr<ir::Graph> SequentialExecutionPass::ApplyImpl( ...@@ -107,4 +107,4 @@ std::unique_ptr<ir::Graph> SequentialExecutionPass::ApplyImpl(
REGISTER_PASS(sequential_execution_pass, REGISTER_PASS(sequential_execution_pass,
paddle::framework::details::SequentialExecutionPass) paddle::framework::details::SequentialExecutionPass)
.RequirePassAttr(paddle::framework::details::kAllOpDescs); .RequireGraphAttr(paddle::framework::details::kStaleProgramOpDescs);
...@@ -23,9 +23,8 @@ namespace framework { ...@@ -23,9 +23,8 @@ namespace framework {
namespace details { namespace details {
ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor( ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes, const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places, const std::vector<platform::Place> &places, ir::Graph *graph)
std::unique_ptr<ir::Graph> &&graph) : graph_(graph),
: graph_(std::move(graph)),
pool_(strategy.num_threads_ >= 2 ? new ::ThreadPool(strategy.num_threads_) pool_(strategy.num_threads_ >= 2 ? new ::ThreadPool(strategy.num_threads_)
: nullptr), : nullptr),
local_scopes_(local_scopes), local_scopes_(local_scopes),
...@@ -37,7 +36,7 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor( ...@@ -37,7 +36,7 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
FeedFetchList ThreadedSSAGraphExecutor::Run( FeedFetchList ThreadedSSAGraphExecutor::Run(
const std::vector<std::string> &fetch_tensors) { const std::vector<std::string> &fetch_tensors) {
std::unique_ptr<platform::RecordEvent> event( std::unique_ptr<platform::RecordEvent> event(
new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare", nullptr)); new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare"));
std::unordered_map<OpHandleBase *, size_t> pending_ops; std::unordered_map<OpHandleBase *, size_t> pending_ops;
std::unordered_set<VarHandleBase *> pending_vars; std::unordered_set<VarHandleBase *> pending_vars;
auto ready_vars = std::make_shared<BlockingQueue<VarHandleBase *>>(); auto ready_vars = std::make_shared<BlockingQueue<VarHandleBase *>>();
...@@ -110,7 +109,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( ...@@ -110,7 +109,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
for (auto &run_op_future : run_op_futures_) { for (auto &run_op_future : run_op_futures_) {
run_op_future.wait(); run_op_future.wait();
} }
ClearFetchOp(graph_.get(), &fetch_ops); ClearFetchOp(graph_, &fetch_ops);
exception_holder_.ReThrow(); exception_holder_.ReThrow();
} else { } else {
continue; continue;
...@@ -135,7 +134,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( ...@@ -135,7 +134,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
} }
PADDLE_ENFORCE(ready_ops.empty()); PADDLE_ENFORCE(ready_ops.empty());
// Wait FetchOps. // Wait FetchOps.
ClearFetchOp(graph_.get(), &fetch_ops); ClearFetchOp(graph_, &fetch_ops);
return fetch_data; return fetch_data;
} }
...@@ -219,7 +218,7 @@ void ThreadedSSAGraphExecutor::RunOp( ...@@ -219,7 +218,7 @@ void ThreadedSSAGraphExecutor::RunOp(
VLOG(10) << op << " " << op->Name() << " Done "; VLOG(10) << op << " " << op->Name() << " Done ";
running_ops_--; running_ops_--;
ready_var_q->Extend(op->Outputs()); ready_var_q->Extend(op->Outputs());
VLOG(10) << op << " " << op->Name() << "Signal posted"; VLOG(10) << op << " " << op->Name() << " Signal posted";
} catch (...) { } catch (...) {
exception_holder_.Catch(std::current_exception()); exception_holder_.Catch(std::current_exception());
} }
......
...@@ -41,7 +41,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { ...@@ -41,7 +41,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
ThreadedSSAGraphExecutor(const ExecutionStrategy &strategy, ThreadedSSAGraphExecutor(const ExecutionStrategy &strategy,
const std::vector<Scope *> &local_scopes, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places, const std::vector<platform::Place> &places,
std::unique_ptr<ir::Graph> &&graph); ir::Graph *graph);
const ir::Graph &Graph() const override { return *graph_; } const ir::Graph &Graph() const override { return *graph_; }
// Run a SSAGraph by a thread pool // Run a SSAGraph by a thread pool
...@@ -55,7 +55,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { ...@@ -55,7 +55,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
details::OpHandleBase *op); details::OpHandleBase *op);
private: private:
std::unique_ptr<ir::Graph> graph_; ir::Graph *graph_;
std::unique_ptr<::ThreadPool> pool_; std::unique_ptr<::ThreadPool> pool_;
std::vector<Scope *> local_scopes_; std::vector<Scope *> local_scopes_;
std::vector<platform::Place> places_; std::vector<platform::Place> places_;
......
...@@ -20,6 +20,7 @@ limitations under the License. */ ...@@ -20,6 +20,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/framework/transfer_scope_cache.h" #include "paddle/fluid/framework/transfer_scope_cache.h"
#include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/operators/distributed/distributed.h"
......
...@@ -44,6 +44,7 @@ LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name, ...@@ -44,6 +44,7 @@ LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name,
// Since we want to fetch LodTensor from a variable, the variable must // Since we want to fetch LodTensor from a variable, the variable must
// be created alreadly. // be created alreadly.
Variable* g_fetch_value = scope.FindVar(var_name); Variable* g_fetch_value = scope.FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(g_fetch_value, "%s is not found.", var_name);
PADDLE_ENFORCE(g_fetch_value->IsType<FeedFetchList>(), PADDLE_ENFORCE(g_fetch_value->IsType<FeedFetchList>(),
"Only %s can be invoked by GetFetchVariable", "Only %s can be invoked by GetFetchVariable",
typeid(FeedFetchList).name()); typeid(FeedFetchList).name());
......
...@@ -179,11 +179,11 @@ TEST(InferInplace, SingleOpInplaceInToOut) { ...@@ -179,11 +179,11 @@ TEST(InferInplace, SingleOpInplaceInToOut) {
op->SetOutput("Out", {"test2_out"}); op->SetOutput("Out", {"test2_out"});
prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR); prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 64}); prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 64, 128, 128});
prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR); prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR); prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("test2_out"); prog.MutableBlock(0)->Var("test2_out");
prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16}); prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16, 128, 128});
auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_; auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
auto in_to_outs = infer_inplace(*op, op->Block()); auto in_to_outs = infer_inplace(*op, op->Block());
...@@ -201,11 +201,11 @@ TEST(InferInplace, SingleGradOpInplaceInToOut) { ...@@ -201,11 +201,11 @@ TEST(InferInplace, SingleGradOpInplaceInToOut) {
op->SetOutput(GradVarName("X"), {"test2_a", "test2_b", "test2_c"}); op->SetOutput(GradVarName("X"), {"test2_a", "test2_b", "test2_c"});
prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR); prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 16}); prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR); prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR); prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("test2_out"); prog.MutableBlock(0)->Var("test2_out");
prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16}); prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16, 1024, 1024});
auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_; auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
auto in_to_outs = infer_inplace(*op, op->Block()); auto in_to_outs = infer_inplace(*op, op->Block());
...@@ -233,12 +233,12 @@ TEST(InferInplace, MultiOutInplaceInToOut) { ...@@ -233,12 +233,12 @@ TEST(InferInplace, MultiOutInplaceInToOut) {
prog.MutableBlock(0)->Var("o0"); prog.MutableBlock(0)->Var("o0");
prog.MutableBlock(0)->Var("y0"); prog.MutableBlock(0)->Var("y0");
prog.MutableBlock(0)->Var("z0"); prog.MutableBlock(0)->Var("z0");
prog.MutableBlock(0)->Var("a0")->SetShape({32, 16}); prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("b0")->SetShape({32, 16}); prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("c0")->SetShape({32, 16}); prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("o0")->SetShape({32, 16}); prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("y0")->SetShape({32, 16}); prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("z0")->SetShape({32, 16}); prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024});
auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_; auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
auto in_to_outs = infer_inplace(*op, op->Block()); auto in_to_outs = infer_inplace(*op, op->Block());
...@@ -267,12 +267,12 @@ TEST(InferInplace, MultiGradInplaceInToOut) { ...@@ -267,12 +267,12 @@ TEST(InferInplace, MultiGradInplaceInToOut) {
prog.MutableBlock(0)->Var("o0"); prog.MutableBlock(0)->Var("o0");
prog.MutableBlock(0)->Var("y0"); prog.MutableBlock(0)->Var("y0");
prog.MutableBlock(0)->Var("z0"); prog.MutableBlock(0)->Var("z0");
prog.MutableBlock(0)->Var("a0")->SetShape({32, 16}); prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("b0")->SetShape({32, 16}); prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("c0")->SetShape({32, 16}); prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("o0")->SetShape({32, 16}); prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("y0")->SetShape({32, 16}); prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("z0")->SetShape({32, 16}); prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024});
auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_; auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
auto in_to_outs = infer_inplace(*op, op->Block()); auto in_to_outs = infer_inplace(*op, op->Block());
......
...@@ -102,6 +102,8 @@ cc_test(test_seqpool_concat_fuse_pass SRCS seqpool_concat_fuse_pass_tester.cc DE ...@@ -102,6 +102,8 @@ cc_test(test_seqpool_concat_fuse_pass SRCS seqpool_concat_fuse_pass_tester.cc DE
cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass) cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass)
if (WITH_MKLDNN) if (WITH_MKLDNN)
cc_test(test_depthwise_conv_mkldnn_pass SRCS mkldnn/depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass) cc_test(test_depthwise_conv_mkldnn_pass SRCS mkldnn/depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass)
cc_test(test_conv_bias_mkldnn_fuse_pass SRCS mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc DEPS conv_bias_mkldnn_fuse_pass naive_executor)
cc_test(test_conv_relu_mkldnn_fuse_pass SRCS mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass) cc_test(test_conv_relu_mkldnn_fuse_pass SRCS mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass) cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass)
cc_test(test_mkldnn_placement_pass SRCS mkldnn/mkldnn_placement_pass_tester.cc DEPS mkldnn_placement_pass)
endif () endif ()
...@@ -22,7 +22,8 @@ namespace ir { ...@@ -22,7 +22,8 @@ namespace ir {
class AttentionLSTMFusePass : public FusePassBase { class AttentionLSTMFusePass : public FusePassBase {
protected: protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const; std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
}; };
} // namespace ir } // namespace ir
......
...@@ -31,7 +31,8 @@ class ConvAffineChannelFusePass : public FusePassBase { ...@@ -31,7 +31,8 @@ class ConvAffineChannelFusePass : public FusePassBase {
virtual ~ConvAffineChannelFusePass() {} virtual ~ConvAffineChannelFusePass() {}
protected: protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const; std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
const std::string name_scope_{"conv_affine_channel_fuse"}; const std::string name_scope_{"conv_affine_channel_fuse"};
}; };
...@@ -40,7 +41,8 @@ class ConvEltwiseAddAffineChannelFusePass : public FusePassBase { ...@@ -40,7 +41,8 @@ class ConvEltwiseAddAffineChannelFusePass : public FusePassBase {
virtual ~ConvEltwiseAddAffineChannelFusePass() {} virtual ~ConvEltwiseAddAffineChannelFusePass() {}
protected: protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const; std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
const std::string name_scope_{"conv_eltwiseadd_affine_channel_fuse"}; const std::string name_scope_{"conv_eltwiseadd_affine_channel_fuse"};
}; };
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册