提交 feb39028 编写于 作者: M minqiyang

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into imperative_lr_scheduler

无相关合并请求
......@@ -42,12 +42,6 @@ repos:
entry: bash ./tools/codestyle/pylint_pre_commit.hook
language: system
files: \.(py)$
- repo: https://github.com/PaddlePaddle/pre-commit-golang
sha: 8337620115c25ff8333f1b1a493bd031049bd7c0
hooks:
- id: go-fmt
types:
- go
- repo: local
hooks:
- id: copyright_checker
......
......@@ -44,6 +44,7 @@
| qingqing01 | Qing-Qing Dang |
| reyoung | Yang Yu |
| Sand3r- | Michal Gallus |
| sfraczek | Sylwester Fraczek |
| Superjom | Chun-Wei Yan |
| tensor-tang | Jian Tang |
| tianbingsz | Tian-Bing Xu |
......@@ -54,6 +55,7 @@
| wangyang59 | Yang Wang |
| wangzhen-nlp | Zhen Wang |
| wen-bo-yang | Wen-Bo Yang |
| wojtuss | Wojciech Uss |
| wwhu | Wei-Wei Hu |
| xinghai-sun | Xing-Hai Sun |
| Xreki | Yi-Qun Liu |
......
......@@ -25,12 +25,18 @@ message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: "
message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
"${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
if(WIN32)
set(CMAKE_SUPPRESS_REGENERATION ON)
set(CMAKE_STATIC_LIBRARY_PREFIX lib)
add_definitions("/DGOOGLE_GLOG_DLL_DECL=")
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd")
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
add_compile_options(/wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838)
set(PADDLE_LINK_FLAGS "/IGNORE:4006 /IGNORE:4098 /IGNORE:4217 /IGNORE:4221")
set(CMAKE_STATIC_LINKER_FLAGS "${CMAKE_STATIC_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
endif(WIN32)
find_package(CUDA QUIET)
......@@ -48,23 +54,12 @@ option(WITH_NGRAPH "Compile PaddlePaddle with nGraph support." OFF)
option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON)
option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF)
option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON)
option(WITH_DOUBLE "Compile PaddlePaddle with double precision" OFF)
option(WITH_RDMA "Compile PaddlePaddle with RDMA support" OFF)
option(WITH_TIMER "Compile PaddlePaddle with stats timer" OFF)
option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler and gperftools" OFF)
option(WITH_JEMALLOC "Compile PaddlePaddle with jemalloc" OFF)
option(WITH_DOC "Compile PaddlePaddle with documentation" OFF)
option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF)
option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF)
option(ON_TRAVIS "Exclude special unit test on Travis CI" OFF)
option(WITH_FLUID_ONLY "Compile PaddlePaddle fluid only" OFF)
option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF)
option(GLIDE_INSTALL "Download and install go dependencies " ON)
option(WITH_DISTRIBUTE "Compile with distributed support" OFF)
option(WITH_PSLIB "Compile with pslib support" OFF)
option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF)
option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen" OFF)
option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF)
option(WITH_CONTRIB "Compile the third-party contributation" OFF)
option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF)
option(WITH_ANAKIN "Compile with Anakin library" OFF)
......@@ -99,8 +94,6 @@ endif()
if (WIN32)
set(WITH_DISTRIBUTE OFF CACHE STRING
"Disable DISTRIBUTE when compiling for Windows" FORCE)
set(WITH_FLUID_ONLY ON CACHE STRING
"Enable FLUID_ONLY when compiling for Windows" FORCE)
endif()
set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
......@@ -142,7 +135,6 @@ include(external/openblas) # download, build, install openblas
include(external/mkldnn) # download, build, install mkldnn
include(external/ngraph) # download, build, install nGraph
include(external/boost) # download boost
include(external/any) # download libn::any
include(external/eigen) # download eigen3
include(external/pybind11) # download pybind11
include(external/cares)
......@@ -219,7 +211,6 @@ include(generic) # simplify cmake module
include(package) # set paddle packages
include(ccache) # set ccache for compilation
include(util) # set unittest and link libs
include(rdma) # set rdma libraries
include(version) # set PADDLE_VERSION
include(coveralls) # set code coverage
include(inference_lib) # add paddle fluid inference libraries
......@@ -227,38 +218,11 @@ include(inference_lib) # add paddle fluid inference libraries
include_directories("${PADDLE_SOURCE_DIR}")
set(EXTERNAL_LIBS
gflags
glog
${CBLAS_LIBRARIES}
protobuf
zlib
${PYTHON_LIBRARIES}
)
if(WITH_PSLIB)
list(APPEND EXTERNAL_LIBS pslib)
list(APPEND EXTERNAL_LIBS pslib_brpc)
list(APPEND EXTERNAL_LIBS libmct)
endif(WITH_PSLIB)
if(WITH_AMD_GPU)
find_package(HIP)
include(hip)
endif(WITH_AMD_GPU)
if(WITH_MKLML)
list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB})
endif()
if(WITH_LIBXSMM)
list(APPEND EXTERNAL_LIBS ${LIBXSMM_LIBS})
endif()
if(WITH_MKLDNN)
list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB})
endif()
set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
......
......@@ -75,8 +75,9 @@ RUN curl -s -q https://glide.sh/get | sh
# and its size is only one-third of the official one.
# 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle.
# See https://github.com/PaddlePaddle/Paddle/issues/10129 for details.
RUN wget -qO- http://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \
tar -xz -C /usr/local && \
RUN wget -q https://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.1.6-ubuntu14.04.x86_64-gnu.cuda.8.0.cudnn7.0.tar.gz --no-check-certificate && \
tar -zxf TensorRT-4.0.1.6-ubuntu14.04.x86_64-gnu.cuda.8.0.cudnn7.0.tar.gz -C /usr/local && \
cp -rf /usr/local/TensorRT/include /usr && \
cp -rf /usr/local/TensorRT/lib /usr
......
# PaddlePaddle
English | [简体中文](./README_cn.md)
[![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html)
[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html)
[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.3/beginners_guide/index_en.html)
[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.3/beginners_guide/index.html)
[![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
Welcome to the PaddlePaddle GitHub.
PaddlePaddle (PArallel Distributed Deep LEarning) is an easy-to-use,
......@@ -18,17 +18,7 @@ learning to many products at Baidu.
Our vision is to enable deep learning for everyone via PaddlePaddle.
Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle.
欢迎来到 PaddlePaddle GitHub
PaddlePaddle (PArallel Distributed Deep LEarning) 是一个简单易用、高效灵活、可扩展的深度学习平台,最初由百度科学家和工程师共同开发,目的是将深度学习技术应用到百度的众多产品中。
我们的愿景是让每个人都能通过PaddlePaddle接触深度学习
跟进PaddlePaddle最新特性请参考我们的[版本说明](https://github.com/PaddlePaddle/Paddle/releases)
### Latest PaddlePaddle Release: [Fluid 1.2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2)
### Latest PaddlePaddle Release: [Fluid 1.3.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.3)
### Install Latest Stable Release:
```
# Linux CPU
......@@ -36,30 +26,13 @@ pip install paddlepaddle
# Linux GPU cuda9cudnn7
pip install paddlepaddle-gpu
# Linux GPU cuda8cudnn7
pip install paddlepaddle-gpu==1.2.0.post87
pip install paddlepaddle-gpu==1.3.0.post87
# Linux GPU cuda8cudnn5
pip install paddlepaddle-gpu==1.2.0.post85
pip install paddlepaddle-gpu==1.3.0.post85
# For installation on other platform, refer to http://paddlepaddle.org/
```
### PaddlePaddle最新版本: [Fluid 1.2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2)
### 安装最新稳定版本:
```
# Linux CPU
pip install paddlepaddle
# Linux GPU cuda9cudnn7
pip install paddlepaddle-gpu
# Linux GPU cuda8cudnn7
pip install paddlepaddle-gpu==1.2.0.post87
# Linux GPU cuda8cudnn5
pip install paddlepaddle-gpu==1.2.0.post85
# 其他平台上的安装指引请参考 http://paddlepaddle.org/
```
## Features
- **Flexibility**
......@@ -100,90 +73,34 @@ pip install paddlepaddle-gpu==1.2.0.post85
Baidu and it has achieved a significant impact. We hope you can also explore
the capability of PaddlePaddle to make an impact on your product.
## 特点
- **灵活性**
PaddlePaddle支持丰富的神经网络架构和优化算法。易于配置复杂模型,例如带有注意力机制或复杂记忆连接的神经网络机器翻译模型。
- **高效性**
为了高效使用异步计算资源,PaddlePaddle对框架的不同层进行优化,包括计算、存储、架构和通信。下面是一些样例:
- 通过SSE/AVX 内置函数、BLAS库(例如MKL、OpenBLAS、cuBLAS)或定制的CPU/GPU内核优化数学操作。
- 通过MKL-DNN库优化CNN网络
- 高度优化循环网络,无需执行 `padding` 操作即可处理 **变长** 序列
- 针对高维稀疏数据模型,优化了局部和分布式训练。
- **稳定性**
有了 PaddlePaddle,使得利用各种CPU/GPU和机器来加速训练变得简单。PaddlePaddle 通过优化通信可以实现巨大吞吐量和快速执行。
- **连接产品**
另外,PaddlePaddle 的设计也易于部署。在百度,PaddlePaddle 已经部署到含有巨大用户量的产品和服务上,包括广告点击率(CTR)预测、大规模图像分类、光学字符识别(OCR)、搜索排序,计算机病毒检测、推荐系统等等。PaddlePaddle广泛应用于百度产品中,产生了非常重要的影响。我们希望您也能探索 PaddlePaddle 的能力,为您的产品创造新的影响力和效果。
## Installation
It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/install/index_cn.html) on our website.
## 安装
推荐阅读官网上的[安装说明](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/install/index_cn.html)
It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/en/1.3/beginners_guide/index_en.html) on our website.
## Documentation
We provide [English](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html) and
[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html) documentation.
We provide [English](http://paddlepaddle.org/documentation/docs/en/1.3/beginners_guide/index_en.html) and
[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.3/beginners_guide/index.html) documentation.
- [Deep Learning 101](https://github.com/PaddlePaddle/book)
You might want to start from this online interactive book that can run in a Jupyter Notebook.
- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html)
- [Distributed Training](http://paddlepaddle.org/documentation/docs/en/1.3/user_guides/howto/training/multi_node_en.html)
You can run distributed training jobs on MPI clusters.
- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.2/api_cn/index_cn.html)
- [Python API](http://paddlepaddle.org/documentation/docs/en/1.3/api/index_en.html)
Our new API enables much shorter programs.
- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/1.2/advanced_usage/development/contribute_to_paddle/index_cn.html)
- [How to Contribute](http://paddlepaddle.org/documentation/docs/en/1.3/advanced_usage/development/contribute_to_paddle/index_en.html)
We appreciate your contributions!
## 文档
我们提供[英文](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html)
[中文](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html) 文档
- [深度学习101](https://github.com/PaddlePaddle/book)
或许您想从这个在线交互式书籍开始,可以在Jupyter Notebook中运行
- [分布式训练](http://paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html)
可以在MPI集群上运行分布式训练任务
- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.2/api_cn/index_cn.html)
新的API支持代码更少更简洁的程序
- [贡献方式](http://paddlepaddle.org/documentation/docs/zh/1.2/advanced_usage/development/contribute_to_paddle/index_cn.html)
欢迎您的贡献!
## Ask Questions
You are welcome to submit questions and bug reports as [Github Issues](https://github.com/PaddlePaddle/Paddle/issues).
## 答疑
欢迎您将问题和bug报告以[Github Issues](https://github.com/PaddlePaddle/Paddle/issues)的形式提交
## Copyright and License
PaddlePaddle is provided under the [Apache-2.0 license](LICENSE).
## 版权和许可证
PaddlePaddle由[Apache-2.0 license](LICENSE)提供
# PaddlePaddle
[English](./README.md) | 简体中文
[![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.3/beginners_guide/index_en.html)
[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.3/beginners_guide/index.html)
[![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
欢迎来到 PaddlePaddle GitHub
PaddlePaddle (PArallel Distributed Deep LEarning) 是一个简单易用、高效灵活、可扩展的深度学习平台,最初由百度科学家和工程师共同开发,目的是将深度学习技术应用到百度的众多产品中。
我们的愿景是让每个人都能通过PaddlePaddle接触深度学习
跟进PaddlePaddle最新特性请参考我们的[版本说明](https://github.com/PaddlePaddle/Paddle/releases)
### PaddlePaddle最新版本: [Fluid 1.3.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.3)
### 安装最新稳定版本:
```
# Linux CPU
pip install paddlepaddle
# Linux GPU cuda9cudnn7
pip install paddlepaddle-gpu
# Linux GPU cuda8cudnn7
pip install paddlepaddle-gpu==1.3.0.post87
# Linux GPU cuda8cudnn5
pip install paddlepaddle-gpu==1.3.0.post85
# 其他平台上的安装指引请参考 http://paddlepaddle.org/
```
## 特性
- **灵活性**
PaddlePaddle支持丰富的神经网络架构和优化算法。易于配置复杂模型,例如带有注意力机制或复杂记忆连接的神经网络机器翻译模型。
- **高效性**
为了高效使用异步计算资源,PaddlePaddle对框架的不同层进行优化,包括计算、存储、架构和通信。下面是一些样例:
- 通过SSE/AVX 内置函数、BLAS库(例如MKL、OpenBLAS、cuBLAS)或定制的CPU/GPU内核优化数学操作。
- 通过MKL-DNN库优化CNN网络
- 高度优化循环网络,无需执行 `padding` 操作即可处理 **变长** 序列
- 针对高维稀疏数据模型,优化了局部和分布式训练。
- **稳定性**
有了 PaddlePaddle,使得利用各种CPU/GPU和机器来加速训练变得简单。PaddlePaddle 通过优化通信可以实现巨大吞吐量和快速执行。
- **与产品相连**
另外,PaddlePaddle 的设计也易于部署。在百度,PaddlePaddle 已经部署到含有巨大用户量的产品和服务上,包括广告点击率(CTR)预测、大规模图像分类、光学字符识别(OCR)、搜索排序,计算机病毒检测、推荐系统等等。PaddlePaddle广泛应用于百度产品中,产生了非常重要的影响。我们希望您也能探索 PaddlePaddle 的能力,为您的产品创造新的影响力和效果。
## 安装
推荐阅读官网上的[安装说明](http://paddlepaddle.org/documentation/docs/zh/1.3/beginners_guide/install/index_cn.html)
## 文档
我们提供[英文](http://paddlepaddle.org/documentation/docs/en/1.3/beginners_guide/index_en.html)
[中文](http://paddlepaddle.org/documentation/docs/zh/1.3/beginners_guide/index.html) 文档
- [深度学习101](https://github.com/PaddlePaddle/book)
或许您想从这个在线交互式书籍开始,可以在Jupyter Notebook中运行
- [分布式训练](http://paddlepaddle.org/documentation/docs/zh/1.3/user_guides/howto/training/multi_node.html)
可以在MPI集群上运行分布式训练任务
- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.3/api_cn/index_cn.html)
新的API支持代码更少更简洁的程序
- [贡献方式](http://paddlepaddle.org/documentation/docs/zh/1.3/advanced_usage/development/contribute_to_paddle/index_cn.html)
欢迎您的贡献!
## 答疑
欢迎您将问题和bug报告以[Github Issues](https://github.com/PaddlePaddle/Paddle/issues)的形式提交
## 版权和许可证
PaddlePaddle由[Apache-2.0 license](LICENSE)提供
# Benchmark
Machine:
- Server: Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz, 2 Sockets, 20 Cores per socket
- Laptop: TBD
System: CentOS release 6.3 (Final), Docker 1.12.1.
PaddlePaddle:
- paddlepaddle/paddle:0.11.0 (for MKLML and MKL-DNN)
- MKL-DNN tag v0.11
- MKLML 2018.0.1.20171007
- paddlepaddle/paddle:0.11.0-openblas (for OpenBLAS)
- OpenBLAS v0.2.20
On each machine, we will test and compare the performance of training on single node using MKL-DNN / MKLML / OpenBLAS respectively.
## Benchmark Model
### Server
#### Training
Test on batch size 64, 128, 256 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
Pay attetion that the speed below includes forward, backward and parameter update time. So we can not directly compare the data with the benchmark of caffe `time` [command](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/caffe/image/run.sh#L9), which only contain forward and backward. The updating time of parameter would become very heavy when the weight size are large, especially on alexnet.
Input image size - 3 * 224 * 224, Time: images/second
- VGG-19
| BatchSize | 64 | 128 | 256 |
|--------------|-------| -----| --------|
| OpenBLAS | 7.80 | 9.00 | 10.80 |
| MKLML | 12.12 | 13.70 | 16.18 |
| MKL-DNN | 28.46 | 29.83 | 30.44 |
<img src="figs/vgg-cpu-train.png" width="500">
- ResNet-50
| BatchSize | 64 | 128 | 256 |
|--------------|-------| ------| -------|
| OpenBLAS | 25.22 | 25.68 | 27.12 |
| MKLML | 32.52 | 31.89 | 33.12 |
| MKL-DNN | 81.69 | 82.35 | 84.08 |
<img src="figs/resnet-cpu-train.png" width="500">
- GoogLeNet
| BatchSize | 64 | 128 | 256 |
|--------------|-------| ------| -------|
| OpenBLAS | 89.52 | 96.97 | 108.25 |
| MKLML | 128.46| 137.89| 158.63 |
| MKL-DNN     | 250.46| 264.83| 269.50 |
<img src="figs/googlenet-cpu-train.png" width="500">
- AlexNet
| BatchSize | 64 | 128 | 256 |
|--------------|--------| ------ | -------|
| OpenBLAS | 45.62 | 72.79 | 107.22 |
| MKLML | 66.37 | 105.60 | 144.04 |
| MKL-DNN | 399.00 | 498.94 | 626.53 |
<img src="figs/alexnet-cpu-train.png" width="500">
#### Inference
Test on batch size 1, 2, 4, 8, 16 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
- VGG-19
| BatchSize | 1 | 2 | 4 | 8 | 16 |
|-----------|-------|-------|-------|-------|-------|
| OpenBLAS | 1.10 | 1.96 | 3.62 | 3.63 | 2.25 |
| MKLML | 5.58 | 9.80 | 15.15 | 21.21 | 28.67 |
| MKL-DNN | 75.07 | 88.64 | 82.58 | 92.29 | 96.75 |
<img src="figs/vgg-cpu-infer.png" width="500">
- ResNet-50
| BatchSize | 1 | 2 | 4 | 8 | 16 |
|-----------|-------|--------|--------|--------|--------|
| OpenBLAS | 3.31 | 6.72 | 11.59 | 13.17 | 9.27 |
| MKLML | 6.33 | 12.02 | 22.88 | 40.53 | 63.09 |
| MKL-DNN | 107.83| 148.84 | 177.78 | 189.35 | 217.69 |
<img src="figs/resnet-cpu-infer.png" width="500">
- GoogLeNet
| BatchSize | 1 | 2 | 4 | 8 | 16 |
|-----------|--------|--------|--------|--------|--------|
| OpenBLAS | 12.06 | 23.56 | 34.48 | 36.45 | 23.12 |
| MKLML | 22.74 | 41.56 | 81.22 | 133.47 | 210.53 |
| MKL-DNN | 175.10 | 272.92 | 450.70 | 512.00 | 600.94 |
<img src="figs/googlenet-cpu-infer.png" width="500">
- AlexNet
| BatchSize | 1 | 2 | 4 | 8 | 16 |
|-----------|--------|--------|--------|--------|--------|
| OpenBLAS | 3.53 | 6.23 | 15.04 | 26.06 | 31.62 |
| MKLML | 21.32 | 36.55 | 73.06 | 131.15 | 192.77 |
| MKL-DNN | 442.91 | 656.41 | 719.10 | 847.68 | 850.51 |
<img src="figs/alexnet-cpu-infer.png" width="500">
### Laptop
TBD
# Benchmark
Machine:
- CPU: 12-core Intel(R) Xeon(R) CPU E5-2620 v2 @2.10GHz
- GPU: Tesla K40m
- cuDNN: v5.1
- system: Docker 1.12.1, all platforms are tested in docker environment.
Platforms:
- PaddlePaddle: paddledev/paddle:gpu-devel-v0.9.0a0
- Tensorflow: gcr.io/tensorflow/tensorflow:0.11.0rc0-gpu
- Caffe: kaixhin/cuda-caffe
Several convolutional neural networks and recurrent neural networks are used to test.
## Image
### Benchmark Model
AlexNet, GoogleNet and a small network used in Caffe.
- [AlexNet](https://github.com/BVLC/caffe/tree/master/models/bvlc_alexnet): but the group size is one.
- [GoogleNet](https://github.com/BVLC/caffe/tree/master/models/bvlc_googlenet): but remove loss1 and loss2 when testing benchmark.
- [SmallNet](https://github.com/BVLC/caffe/blob/master/examples/cifar10/cifar10\_quick\_train\_test.prototxt)
### Single-GPU
- AlexNet: input - 3 * 227 * 227, Time: ms/batch
| BatchSize | 64 | 128 | 256 | 512 |
|--------------|-----| -----| ------| -----|
| PaddlePaddle | 195 | 334 | 602 | 1629 |
| TensorFlow | 223 | 364 | 645 | 1235 |
| Caffe | 324 | 627 | 1232 | 2513 |
**Notation**
All platforms use cuDNN-v5.1. We see that caffe is slower in this experiment, because its workspace limit size of cuDNN-conv interface is 8 * 1024 * 1024, which is smaller in PaddlePaddle and TensorFlow. Note that Caffe will be faster if increasing the workspace limit size.
- GoogletNet: input - 3 * 224 * 224, Time: ms/batch
| BatchSize | 64 | 128 | 256 |
|--------------|-------| -------| --------|
| PaddlePaddle | 613 | 1149 | 2348 |
| TensorFlow | 644 | 1176 | 2219 |
| Caffe | 694 | 1364 | out of memory |
- SmallNet: input - 3 * 32 * 32, Time ms/batch
| BatchSize | 64 | 128 | 256 | 512 |
|--------------|--------| -------- | --------|---------|
| PaddlePaddle | 10.463 | 18.184 | 33.113 | 63.039 |
| TensorFlow | 9 | 15 | 28 | 59 |
| Caffe | 9.373 | 16.6606 | 31.4797 | 59.719 |
**Notation**
All the single-GPU experiments in caffe use `caffe time` to calculate elapsed time, which does not include parameter updating time. However, both PaddlePaddle and TensorFlow experiments contain the parameter updating time. As compared with the total time, this part is relatively little on single machine, we can ignore it.
In Tensorflow, they implement algorithm searching method instead of using the algorithm searching interface in cuDNN.
### Multi-GPU: 4 GPUs
- AlexNet, ms / batch
| total-BatchSize | 128 * 4 | 256 * 4 |
|------------------|----------| -----------|
| PaddlePaddle | 347 | 622 |
| TensorFlow | 377 | 675 |
| Caffe | 1229 | 2435 |
For example, if `total-BatchSize = 128 * 4`, the speedup ratio is calculated by
```
time_at_1gpu_batch_128 * 4 / time_at_4gpu_total_batch_512
= (334 * 4)/347
= 3.85
```
<img src="figs/alexnet-4gpu.png" width="420">
- GoogleNet, ms / batch
| total-BatchSize | 128 * 4 | 256 * 4 |
|-------------------|--------------| ----------- |
| PaddlePaddle | 1178 | 2367 |
| TensorFlow | 1210 | 2292 |
| Caffe | 2007 | out of memory |
<img src="figs/googlenet-4gpu.png" width="420">
## RNN
We use lstm network for text classfication to test benchmark.
### Dataset
- [IMDB](http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl)
- Sequence length is 100. In fact, PaddlePaddle supports training with variable-length sequence, but TensorFlow needs to pad. Thus, we also pad sequence length to 100 in PaddlePaddle in order to compare.
- Dictionary size=30000
- Peephole connection is used in `lstmemory` by default in PaddlePaddle. It is also configured in TensorFlow.
### Single-GPU
#### LSTM in Text Classification
Testing `2 lstm layer + fc` network with different hidden size and batch size.
- Batch size = 64, ms / batch
| hidden_size | 256 | 512 | 1280 |
|--------------|-------| -------| --------|
| PaddlePaddle | 83 | 184 | 641 |
| TensorFlow | 175 | 280 | 818 |
- Batch size = 128, ms / batch
| hidden_size | 256 | 512 | 1280 |
|--------------|------- | -------| --------|
| PaddlePaddle | 110 | 261 | 1007 |
| TensorFlow | 181 | 361 | 1237 |
- Batch size = 256, ms / batch
| hidden_size | 256 | 512 | 1280 |
|--------------|-------| -------| --------|
| PaddlePaddle | 170 | 414 | 1655 |
| TensorFlow | 238 | 536 | 1905 |
<img src="figs/rnn_lstm_cls.png" width="600">
#### Seq2Seq
The benchmark of sequence-to-sequence network will be added later.
### Multi GPU: 4 GPUs
#### LSTM in Text Classification
- hidden_size = 256, ms / batch
| batch_size | 256 | 512 |
|--------------| -------| --------|
| PaddlePaddle | 90 | 118 |
| TensorFlow | 226 | 118 |
- hidden_size = 512, ms / batch
| batch_size | 256 | 512 |
|--------------| -------| --------|
| PaddlePaddle | 189 | 268 |
| TensorFlow | 297 | 383 |
<img src="figs/rnn_lstm_4gpus.png" width="420">
#### Seq2Seq
The benchmark of sequence-to-sequence network will be added later.
......@@ -15,9 +15,6 @@ RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s
RUN pip install -U pip
RUN pip install -U kubernetes paddlepaddle
RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()\npaddle.dataset.flowers.fetch()" | python'
RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.mnist.train()\npaddle.dataset.mnist.test()\npaddle.dataset.imdb.fetch()" | python'
RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.imikolov.fetch()" | python'
RUN pip uninstall -y paddlepaddle && mkdir /workspace
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
height = 227
width = 227
num_class = 1000
batch_size = get_config_arg('batch_size', int, 128)
gp = get_config_arg('layer_num', int, 1)
is_infer = get_config_arg("is_infer", bool, False)
num_samples = get_config_arg('num_samples', int, 2560)
args = {
'height': height,
'width': width,
'color': True,
'num_class': num_class,
'is_infer': is_infer,
'num_samples': num_samples
}
define_py_data_sources2(
"train.list" if not is_infer else None,
"test.list" if is_infer else None,
module="provider",
obj="process",
args=args)
settings(
batch_size=batch_size,
learning_rate=0.01 / batch_size,
learning_method=MomentumOptimizer(0.9),
regularization=L2Regularization(0.0005 * batch_size))
# conv1
net = data_layer('data', size=height * width * 3)
net = img_conv_layer(
input=net,
filter_size=11,
num_channels=3,
num_filters=96,
stride=4,
padding=1)
net = img_cmrnorm_layer(input=net, size=5, scale=0.0001, power=0.75)
net = img_pool_layer(input=net, pool_size=3, stride=2)
# conv2
net = img_conv_layer(
input=net, filter_size=5, num_filters=256, stride=1, padding=2, groups=gp)
net = img_cmrnorm_layer(input=net, size=5, scale=0.0001, power=0.75)
net = img_pool_layer(input=net, pool_size=3, stride=2)
# conv3
net = img_conv_layer(
input=net, filter_size=3, num_filters=384, stride=1, padding=1)
# conv4
net = img_conv_layer(
input=net, filter_size=3, num_filters=384, stride=1, padding=1, groups=gp)
# conv5
net = img_conv_layer(
input=net, filter_size=3, num_filters=256, stride=1, padding=1, groups=gp)
net = img_pool_layer(input=net, pool_size=3, stride=2)
net = fc_layer(
input=net,
size=4096,
act=ReluActivation(),
layer_attr=ExtraAttr(drop_rate=0.5))
net = fc_layer(
input=net,
size=4096,
act=ReluActivation(),
layer_attr=ExtraAttr(drop_rate=0.5))
net = fc_layer(input=net, size=1000, act=SoftmaxActivation())
if is_infer:
outputs(net)
else:
lab = data_layer('label', num_class)
loss = cross_entropy(input=net, label=lab)
outputs(loss)
#!/usr/bin/env python
from paddle.trainer_config_helpers import *
height = 224
width = 224
num_class = 1000
batch_size = get_config_arg('batch_size', int, 128)
use_gpu = get_config_arg('use_gpu', bool, True)
is_infer = get_config_arg("is_infer", bool, False)
num_samples = get_config_arg('num_samples', int, 2560)
args = {
'height': height,
'width': width,
'color': True,
'num_class': num_class,
'is_infer': is_infer,
'num_samples': num_samples
}
define_py_data_sources2(
"train.list" if not is_infer else None,
"test.list" if is_infer else None,
module="provider",
obj="process",
args=args)
settings(
batch_size=batch_size,
learning_rate=0.01 / batch_size,
learning_method=MomentumOptimizer(0.9),
regularization=L2Regularization(0.0005 * batch_size))
conv_projection = conv_projection if use_gpu else img_conv_layer
def inception2(name, input, channels, \
filter1,
filter3R, filter3,
filter5R, filter5,
proj):
conv1 = name + '_1'
conv3r = name + '_3r'
conv3 = name + '_3'
conv5r = name + '_5r'
conv5 = name + '_5'
maxpool = name + '_max'
convproj = name + '_proj'
cov1 = img_conv_layer(
name=conv1,
input=input,
filter_size=1,
num_channels=channels,
num_filters=filter1,
stride=1,
padding=0)
cov3r = img_conv_layer(
name=conv3r,
input=input,
filter_size=1,
num_channels=channels,
num_filters=filter3R,
stride=1,
padding=0)
cov3 = img_conv_layer(
name=conv3,
input=cov3r,
filter_size=3,
num_filters=filter3,
stride=1,
padding=1)
cov5r = img_conv_layer(
name=conv5r,
input=input,
filter_size=1,
num_channels=channels,
num_filters=filter5R,
stride=1,
padding=0)
cov5 = img_conv_layer(
name=conv5,
input=cov5r,
filter_size=5,
num_filters=filter5,
stride=1,
padding=2)
pool1 = img_pool_layer(
name=maxpool,
input=input,
pool_size=3,
num_channels=channels,
stride=1,
padding=1)
covprj = img_conv_layer(
name=convproj,
input=pool1,
filter_size=1,
num_filters=proj,
stride=1,
padding=0)
cat = concat_layer(name=name, input=[cov1, cov3, cov5, covprj])
return cat
def inception(name, input, channels, \
filter1,
filter3R, filter3,
filter5R, filter5,
proj):
cov1 = conv_projection(
input=input,
filter_size=1,
num_channels=channels,
num_filters=filter1,
stride=1,
padding=0)
cov3r = img_conv_layer(
name=name + '_3r',
input=input,
filter_size=1,
num_channels=channels,
num_filters=filter3R,
stride=1,
padding=0)
cov3 = conv_projection(
input=cov3r, filter_size=3, num_filters=filter3, stride=1, padding=1)
cov5r = img_conv_layer(
name=name + '_5r',
input=input,
filter_size=1,
num_channels=channels,
num_filters=filter5R,
stride=1,
padding=0)
cov5 = conv_projection(
input=cov5r, filter_size=5, num_filters=filter5, stride=1, padding=2)
pool1 = img_pool_layer(
name=name + '_max',
input=input,
pool_size=3,
num_channels=channels,
stride=1,
padding=1)
covprj = conv_projection(
input=pool1, filter_size=1, num_filters=proj, stride=1, padding=0)
cat = concat_layer(
name=name,
input=[cov1, cov3, cov5, covprj],
bias_attr=True if use_gpu else False,
act=ReluActivation())
return cat
data = data_layer(name="input", size=3 * height * width)
# stage 1
conv1 = img_conv_layer(
name="conv1",
input=data,
filter_size=7,
num_channels=3,
num_filters=64,
stride=2,
padding=3)
pool1 = img_pool_layer(
name="pool1", input=conv1, pool_size=3, num_channels=64, stride=2)
# stage 2
conv2_1 = img_conv_layer(
name="conv2_1",
input=pool1,
filter_size=1,
num_filters=64,
stride=1,
padding=0)
conv2_2 = img_conv_layer(
name="conv2_2",
input=conv2_1,
filter_size=3,
num_filters=192,
stride=1,
padding=1)
pool2 = img_pool_layer(
name="pool2", input=conv2_2, pool_size=3, num_channels=192, stride=2)
# stage 3
ince3a = inception("ince3a", pool2, 192, 64, 96, 128, 16, 32, 32)
ince3b = inception("ince3b", ince3a, 256, 128, 128, 192, 32, 96, 64)
pool3 = img_pool_layer(
name="pool3", input=ince3b, num_channels=480, pool_size=3, stride=2)
# stage 4
ince4a = inception("ince4a", pool3, 480, 192, 96, 208, 16, 48, 64)
ince4b = inception("ince4b", ince4a, 512, 160, 112, 224, 24, 64, 64)
ince4c = inception("ince4c", ince4b, 512, 128, 128, 256, 24, 64, 64)
ince4d = inception("ince4d", ince4c, 512, 112, 144, 288, 32, 64, 64)
ince4e = inception("ince4e", ince4d, 528, 256, 160, 320, 32, 128, 128)
pool4 = img_pool_layer(
name="pool4", input=ince4e, num_channels=832, pool_size=3, stride=2)
# stage 5
ince5a = inception("ince5a", pool4, 832, 256, 160, 320, 32, 128, 128)
ince5b = inception("ince5b", ince5a, 832, 384, 192, 384, 48, 128, 128)
pool5 = img_pool_layer(
name="pool5",
input=ince5b,
num_channels=1024,
pool_size=7,
stride=7,
pool_type=AvgPooling())
# We remove loss1 and loss2 for all system when testing benchmark
# output 1
# pool_o1 = img_pool_layer(name="pool_o1", input=ince4a, num_channels=512, pool_size=5, stride=3, pool_type=AvgPooling())
# conv_o1 = img_conv_layer(name="conv_o1", input=pool_o1, filter_size=1, num_filters=128, stride=1, padding=0)
# fc_o1 = fc_layer(name="fc_o1", input=conv_o1, size=1024, layer_attr=ExtraAttr(drop_rate=0.7), act=ReluActivation())
# out1 = fc_layer(name="output1", input=fc_o1, size=1000, act=SoftmaxActivation())
# loss1 = cross_entropy(name='loss1', input=out1, label=lab, coeff=0.3)
# output 2
#pool_o2 = img_pool_layer(name="pool_o2", input=ince4d, num_channels=528, pool_size=5, stride=3, pool_type=AvgPooling())
#conv_o2 = img_conv_layer(name="conv_o2", input=pool_o2, filter_size=1, num_filters=128, stride=1, padding=0)
#fc_o2 = fc_layer(name="fc_o2", input=conv_o2, size=1024, layer_attr=ExtraAttr(drop_rate=0.7), act=ReluActivation())
#out2 = fc_layer(name="output2", input=fc_o2, size=1000, act=SoftmaxActivation())
#loss2 = cross_entropy(name='loss2', input=out2, label=lab, coeff=0.3)
# output 3
dropout = dropout_layer(name="dropout", input=pool5, dropout_rate=0.4)
out3 = fc_layer(
name="output3", input=dropout, size=1000, act=SoftmaxActivation())
if is_infer:
outputs(out3)
else:
lab = data_layer(name="label", size=num_class)
loss3 = cross_entropy(name='loss3', input=out3, label=lab)
outputs(loss3)
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import argparse
import matplotlib.pyplot as plt
def parse_args():
parser = argparse.ArgumentParser('Parse Log')
parser.add_argument(
'--file_path', '-f', type=str, help='the path of the log file')
parser.add_argument(
'--sample_rate',
'-s',
type=float,
default=1.0,
help='the rate to take samples from log')
parser.add_argument(
'--log_period', '-p', type=int, default=1, help='the period of log')
args = parser.parse_args()
return args
def parse_file(file_name):
loss = []
error = []
with open(file_name) as f:
for i, line in enumerate(f):
line = line.strip()
if not line.startswith('pass'):
continue
line_split = line.split(' ')
if len(line_split) != 5:
continue
loss_str = line_split[2][:-1]
cur_loss = float(loss_str.split('=')[-1])
loss.append(cur_loss)
err_str = line_split[3][:-1]
cur_err = float(err_str.split('=')[-1])
error.append(cur_err)
accuracy = [1.0 - err for err in error]
return loss, accuracy
def sample(metric, sample_rate):
interval = int(1.0 / sample_rate)
if interval > len(metric):
return metric[:1]
num = len(metric) / interval
idx = [interval * i for i in range(num)]
metric_sample = [metric[id] for id in idx]
return metric_sample
def plot_metric(metric,
batch_id,
graph_title,
line_style='b-',
line_label='y',
line_num=1):
plt.figure()
plt.title(graph_title)
if line_num == 1:
plt.plot(batch_id, metric, line_style, label=line_label)
else:
for i in range(line_num):
plt.plot(batch_id, metric[i], line_style[i], label=line_label[i])
plt.xlabel('batch')
plt.ylabel(graph_title)
plt.legend()
plt.savefig(graph_title + '.jpg')
plt.close()
def main():
args = parse_args()
assert args.sample_rate > 0. and args.sample_rate <= 1.0, "The sample rate should in the range (0, 1]."
loss, accuracy = parse_file(args.file_path)
batch = [args.log_period * i for i in range(len(loss))]
batch_sample = sample(batch, args.sample_rate)
loss_sample = sample(loss, args.sample_rate)
accuracy_sample = sample(accuracy, args.sample_rate)
plot_metric(loss_sample, batch_sample, 'loss', line_label='loss')
plot_metric(
accuracy_sample,
batch_sample,
'accuracy',
line_style='g-',
line_label='accuracy')
if __name__ == '__main__':
main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io, os
import random
import numpy as np
from paddle.trainer.PyDataProvider2 import *
def initHook(settings, height, width, color, num_class, **kwargs):
settings.height = height
settings.width = width
settings.color = color
settings.num_class = num_class
if settings.color:
settings.data_size = settings.height * settings.width * 3
else:
settings.data_size = settings.height * settings.width
settings.is_infer = kwargs.get('is_infer', False)
settings.num_samples = kwargs.get('num_samples', 2560)
if settings.is_infer:
settings.slots = [dense_vector(settings.data_size)]
else:
settings.slots = [dense_vector(settings.data_size), integer_value(1)]
@provider(
init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM)
def process(settings, file_list):
for i in xrange(settings.num_samples):
img = np.random.rand(1, settings.data_size).reshape(-1, 1).flatten()
if settings.is_infer:
yield img.astype('float32')
else:
lab = random.randint(0, settings.num_class - 1)
yield img.astype('float32'), int(lab)
#!/usr/bin/env python
from paddle.trainer_config_helpers import *
height = 224
width = 224
num_class = 1000
batch_size = get_config_arg('batch_size', int, 64)
layer_num = get_config_arg("layer_num", int, 50)
is_infer = get_config_arg("is_infer", bool, False)
num_samples = get_config_arg('num_samples', int, 2560)
args = {
'height': height,
'width': width,
'color': True,
'num_class': num_class,
'is_infer': is_infer,
'num_samples': num_samples
}
define_py_data_sources2(
"train.list" if not is_infer else None,
"test.list" if is_infer else None,
module="provider",
obj="process",
args=args)
settings(
batch_size=batch_size,
learning_rate=0.01 / batch_size,
learning_method=MomentumOptimizer(0.9),
regularization=L2Regularization(0.0005 * batch_size))
#######################Network Configuration #############
def conv_bn_layer(name,
input,
filter_size,
num_filters,
stride,
padding,
channels=None,
active_type=ReluActivation()):
"""
A wrapper for conv layer with batch normalization layers.
Note:
conv layer has no activation.
"""
tmp = img_conv_layer(
name=name + "_conv",
input=input,
filter_size=filter_size,
num_channels=channels,
num_filters=num_filters,
stride=stride,
padding=padding,
act=LinearActivation(),
bias_attr=False)
return batch_norm_layer(
name=name + "_bn",
input=tmp,
act=active_type,
use_global_stats=is_infer)
def bottleneck_block(name, input, num_filters1, num_filters2):
"""
A wrapper for bottlenect building block in ResNet.
Last conv_bn_layer has no activation.
Addto layer has activation of relu.
"""
last_name = conv_bn_layer(
name=name + '_branch2a',
input=input,
filter_size=1,
num_filters=num_filters1,
stride=1,
padding=0)
last_name = conv_bn_layer(
name=name + '_branch2b',
input=last_name,
filter_size=3,
num_filters=num_filters1,
stride=1,
padding=1)
last_name = conv_bn_layer(
name=name + '_branch2c',
input=last_name,
filter_size=1,
num_filters=num_filters2,
stride=1,
padding=0,
active_type=LinearActivation())
return addto_layer(
name=name + "_addto", input=[input, last_name], act=ReluActivation())
def mid_projection(name, input, num_filters1, num_filters2, stride=2):
"""
A wrapper for middile projection in ResNet.
projection shortcuts are used for increasing dimensions,
and other shortcuts are identity
branch1: projection shortcuts are used for increasing
dimensions, has no activation.
branch2x: bottleneck building block, shortcuts are identity.
"""
# stride = 2
branch1 = conv_bn_layer(
name=name + '_branch1',
input=input,
filter_size=1,
num_filters=num_filters2,
stride=stride,
padding=0,
active_type=LinearActivation())
last_name = conv_bn_layer(
name=name + '_branch2a',
input=input,
filter_size=1,
num_filters=num_filters1,
stride=stride,
padding=0)
last_name = conv_bn_layer(
name=name + '_branch2b',
input=last_name,
filter_size=3,
num_filters=num_filters1,
stride=1,
padding=1)
last_name = conv_bn_layer(
name=name + '_branch2c',
input=last_name,
filter_size=1,
num_filters=num_filters2,
stride=1,
padding=0,
active_type=LinearActivation())
return addto_layer(
name=name + "_addto", input=[branch1, last_name], act=ReluActivation())
img = data_layer(name='image', size=height * width * 3)
def deep_res_net(res2_num=3, res3_num=4, res4_num=6, res5_num=3):
"""
A wrapper for 50,101,152 layers of ResNet.
res2_num: number of blocks stacked in conv2_x
res3_num: number of blocks stacked in conv3_x
res4_num: number of blocks stacked in conv4_x
res5_num: number of blocks stacked in conv5_x
"""
# For ImageNet
# conv1: 112x112
tmp = conv_bn_layer(
"conv1",
input=img,
filter_size=7,
channels=3,
num_filters=64,
stride=2,
padding=3)
tmp = img_pool_layer(name="pool1", input=tmp, pool_size=3, stride=2)
# conv2_x: 56x56
tmp = mid_projection(
name="res2_1", input=tmp, num_filters1=64, num_filters2=256, stride=1)
for i in xrange(2, res2_num + 1, 1):
tmp = bottleneck_block(
name="res2_" + str(i), input=tmp, num_filters1=64, num_filters2=256)
# conv3_x: 28x28
tmp = mid_projection(
name="res3_1", input=tmp, num_filters1=128, num_filters2=512)
for i in xrange(2, res3_num + 1, 1):
tmp = bottleneck_block(
name="res3_" + str(i),
input=tmp,
num_filters1=128,
num_filters2=512)
# conv4_x: 14x14
tmp = mid_projection(
name="res4_1", input=tmp, num_filters1=256, num_filters2=1024)
for i in xrange(2, res4_num + 1, 1):
tmp = bottleneck_block(
name="res4_" + str(i),
input=tmp,
num_filters1=256,
num_filters2=1024)
# conv5_x: 7x7
tmp = mid_projection(
name="res5_1", input=tmp, num_filters1=512, num_filters2=2048)
for i in xrange(2, res5_num + 1, 1):
tmp = bottleneck_block(
name="res5_" + str(i),
input=tmp,
num_filters1=512,
num_filters2=2048)
tmp = img_pool_layer(
name='avgpool',
input=tmp,
pool_size=7,
stride=1,
pool_type=AvgPooling())
return fc_layer(input=tmp, size=num_class, act=SoftmaxActivation())
if layer_num == 50:
resnet = deep_res_net(3, 4, 6, 3)
elif layer_num == 101:
resnet = deep_res_net(3, 4, 23, 3)
elif layer_num == 152:
resnet = deep_res_net(3, 8, 36, 3)
else:
print("Wrong layer number.")
if is_infer:
outputs(resnet)
else:
lbl = data_layer(name="label", size=num_class)
loss = cross_entropy(name='loss', input=resnet, label=lbl)
outputs(loss)
#!/bin/bash
set -e
function train() {
cfg=$1
thread=$2
bz=$3
args="batch_size=$3"
prefix=$4
paddle train --job=time \
--config=$cfg \
--use_gpu=True \
--trainer_count=$thread \
--log_period=10 \
--test_period=100 \
--config_args=$args \
> logs/$prefix-${thread}gpu-$bz.log 2>&1
}
if [ ! -d "train.list" ]; then
echo " " > train.list
fi
if [ ! -d "logs" ]; then
mkdir logs
fi
#========single-gpu=========#
# alexnet
train alexnet.py 1 64 alexnet
train alexnet.py 1 128 alexnet
train alexnet.py 1 256 alexnet
train alexnet.py 1 512 alexnet
# googlenet
train googlenet.py 1 64 googlenet
train googlenet.py 1 128 googlenet
train googlenet.py 1 256 googlenet
# smallnet
train smallnet_mnist_cifar.py 1 64 smallnet
train smallnet_mnist_cifar.py 1 128 smallnet
train smallnet_mnist_cifar.py 1 256 smallnet
train smallnet_mnist_cifar.py 1 512 smallnet
############################
#========multi-gpus=========#
train alexnet.py 4 512 alexnet
train alexnet.py 4 1024 alexnet
train googlenet.py 4 512 googlenet
train googlenet.py 4 1024 googlenet
#!/bin/bash
set -e
function clock_to_seconds() {
hours=`echo $1 | awk -F ':' '{print $1}'`
mins=`echo $1 | awk -F ':' '{print $2}'`
secs=`echo $1 | awk -F ':' '{print $3}'`
echo `awk 'BEGIN{printf "%.2f",('$secs' + '$mins' * 60 + '$hours' * 3600)}'`
}
function infer() {
unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
topology=$1
layer_num=$2
bs=$3
use_mkldnn=$4
if [ $4 == "True" ]; then
thread=1
log="logs/infer-${topology}-${layer_num}-mkldnn-${bs}.log"
elif [ $4 == "False" ]; then
thread=`nproc`
if [ $thread -gt $bs ]; then
thread=$bs
fi
log="logs/infer-${topology}-${layer_num}-${thread}mklml-${bs}.log"
else
echo "Wrong input $4, use True or False."
exit 0
fi
models_in="models/${topology}-${layer_num}/pass-00000/"
if [ ! -d $models_in ]; then
echo "Training model ${topology}_${layer_num}"
paddle train --job=train \
--config="${topology}.py" \
--use_mkldnn=True \
--use_gpu=False \
--trainer_count=1 \
--num_passes=1 \
--save_dir="models/${topology}-${layer_num}" \
--config_args="batch_size=128,layer_num=${layer_num},num_samples=256" \
> /dev/null 2>&1
echo "Done"
fi
log_period=$((256 / bs))
paddle train --job=test \
--config="${topology}.py" \
--use_mkldnn=$use_mkldnn \
--use_gpu=False \
--trainer_count=$thread \
--log_period=$log_period \
--config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True" \
--init_model_path=$models_in \
2>&1 | tee ${log}
# calculate the last 5 logs period time of 1280 samples,
# the time before are burning time.
start=`tail ${log} -n 7 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
end=`tail ${log} -n 2 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
start_sec=`clock_to_seconds $start`
end_sec=`clock_to_seconds $end`
fps=`awk 'BEGIN{printf "%.2f",(1280 / ('$end_sec' - '$start_sec'))}'`
echo "Last 1280 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec;" >> ${log}
echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
}
if [ ! -f "train.list" ]; then
echo " " > train.list
fi
if [ ! -f "test.list" ]; then
echo " " > test.list
fi
if [ ! -d "logs" ]; then
mkdir logs
fi
if [ ! -d "models" ]; then
mkdir -p models
fi
# inference benchmark
for use_mkldnn in True False; do
for batchsize in 1 2 4 8 16; do
infer vgg 19 $batchsize $use_mkldnn
infer resnet 50 $batchsize $use_mkldnn
infer googlenet v1 $batchsize $use_mkldnn
infer alexnet 2 $batchsize $use_mkldnn
done
done
#!/bin/bash
set -e
function train() {
unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
topology=$1
layer_num=$2
bs=$3
use_mkldnn=$4
if [ $4 == "True" ]; then
thread=1
log="logs/train-${topology}-${layer_num}-mkldnn-${bs}.log"
elif [ $4 == "False" ]; then
thread=`nproc`
# each trainer_count use only 1 core to avoid conflict
log="logs/train-${topology}-${layer_num}-${thread}mklml-${bs}.log"
else
echo "Wrong input $4, use True or False."
exit 0
fi
args="batch_size=${bs},layer_num=${layer_num}"
config="${topology}.py"
paddle train --job=time \
--config=$config \
--use_mkldnn=$use_mkldnn \
--use_gpu=False \
--trainer_count=$thread \
--log_period=10 \
--test_period=100 \
--config_args=$args \
2>&1 | tee ${log}
avg_time=`tail ${log} -n 1 | awk -F ' ' '{print $8}' | sed 's/avg=//'`
fps=`awk 'BEGIN{printf "%.2f",('$bs' / '$avg_time' * 1000)}'`
echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
}
if [ ! -f "train.list" ]; then
echo " " > train.list
fi
if [ ! -d "logs" ]; then
mkdir logs
fi
# training benchmark
for use_mkldnn in True False; do
for batchsize in 64 128 256; do
train vgg 19 $batchsize $use_mkldnn
train resnet 50 $batchsize $use_mkldnn
train googlenet v1 $batchsize $use_mkldnn
train alexnet 2 $batchsize $use_mkldnn
done
done
#!/bin/bash
set -e
function clock_to_seconds() {
hours=`echo $1 | awk -F ':' '{print $1}'`
mins=`echo $1 | awk -F ':' '{print $2}'`
secs=`echo $1 | awk -F ':' '{print $3}'`
echo `awk 'BEGIN{printf "%.2f",('$secs' + '$mins' * 60 + '$hours' * 3600)}'`
}
function infer() {
export OPENBLAS_MAIN_FREE=1
topology=$1
layer_num=$2
bs=$3
trainers=`nproc`
if [ $trainers -gt $bs ]; then
trainers=$bs
fi
log="logs/infer-${topology}-${layer_num}-${trainers}openblas-${bs}.log"
threads=$((`nproc` / trainers))
if [ $threads -eq 0 ]; then
threads=1
fi
export OPENBLAS_NUM_THREADS=$threads
models_in="models/${topology}-${layer_num}/pass-00000/"
if [ ! -d $models_in ]; then
echo "./run_mkl_infer.sh to save the model first"
exit 0
fi
log_period=$((32 / bs))
paddle train --job=test \
--config="${topology}.py" \
--use_mkldnn=False \
--use_gpu=False \
--trainer_count=$trainers \
--log_period=$log_period \
--config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True,num_samples=256" \
--init_model_path=$models_in \
2>&1 | tee ${log}
# calculate the last 5 logs period time of 160(=32*5) samples,
# the time before are burning time.
start=`tail ${log} -n 7 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
end=`tail ${log} -n 2 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
start_sec=`clock_to_seconds $start`
end_sec=`clock_to_seconds $end`
fps=`awk 'BEGIN{printf "%.2f",(160 / ('$end_sec' - '$start_sec'))}'`
echo "Last 160 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec;" >> ${log}
echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
}
if [ ! -f "train.list" ]; then
echo " " > train.list
fi
if [ ! -f "test.list" ]; then
echo " " > test.list
fi
if [ ! -d "logs" ]; then
mkdir logs
fi
# inference benchmark
for batchsize in 1 2 4 8 16; do
infer vgg 19 $batchsize
infer resnet 50 $batchsize
infer googlenet v1 $batchsize
infer alexnet 2 $batchsize
done
#!/bin/bash
set -e
function train() {
export OPENBLAS_NUM_THREADS=1
topology=$1
layer_num=$2
bs=$3
thread=`nproc`
# each trainer_count use only 1 core to avoid conflict
log="logs/train-${topology}-${layer_num}-${thread}openblas-${bs}.log"
args="batch_size=${bs},layer_num=${layer_num}"
config="${topology}.py"
paddle train --job=time \
--config=$config \
--use_mkldnn=False \
--use_gpu=False \
--trainer_count=$thread \
--log_period=3 \
--test_period=30 \
--config_args=$args \
2>&1 | tee ${log}
avg_time=`tail ${log} -n 1 | awk -F ' ' '{print $8}' | sed 's/avg=//'`
fps=`awk 'BEGIN{printf "%.2f",('$bs' / '$avg_time' * 1000)}'`
echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
}
if [ ! -f "train.list" ]; then
echo " " > train.list
fi
if [ ! -d "logs" ]; then
mkdir logs
fi
# training benchmark
for batchsize in 64 128 256; do
train vgg 19 $batchsize
train resnet 50 $batchsize
train googlenet v1 $batchsize
train alexnet 2 $batchsize
done
#!/usr/bin/env python
from paddle.trainer_config_helpers import *
height = 32
width = 32
num_class = 10
batch_size = get_config_arg('batch_size', int, 128)
args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
define_py_data_sources2(
"train.list", None, module="provider", obj="process", args=args)
settings(
batch_size=batch_size,
learning_rate=0.01 / batch_size,
learning_method=MomentumOptimizer(0.9),
regularization=L2Regularization(0.0005 * batch_size))
# conv1
net = data_layer('data', size=height * width * 3)
net = img_conv_layer(
input=net,
filter_size=5,
num_channels=3,
num_filters=32,
stride=1,
padding=2)
net = img_pool_layer(input=net, pool_size=3, stride=2, padding=1)
# conv2
net = img_conv_layer(
input=net, filter_size=5, num_filters=32, stride=1, padding=2)
net = img_pool_layer(
input=net, pool_size=3, stride=2, padding=1, pool_type=AvgPooling())
# conv3
net = img_conv_layer(
input=net, filter_size=3, num_filters=64, stride=1, padding=1)
net = img_pool_layer(
input=net, pool_size=3, stride=2, padding=1, pool_type=AvgPooling())
net = fc_layer(input=net, size=64, act=ReluActivation())
net = fc_layer(input=net, size=10, act=SoftmaxActivation())
lab = data_layer('label', num_class)
loss = classification_cost(input=net, label=lab)
outputs(loss)
#!/usr/bin/env python
from paddle.trainer_config_helpers import *
height = 224
width = 224
num_class = 1000
batch_size = get_config_arg('batch_size', int, 64)
layer_num = get_config_arg('layer_num', int, 19)
is_infer = get_config_arg("is_infer", bool, False)
num_samples = get_config_arg('num_samples', int, 2560)
args = {
'height': height,
'width': width,
'color': True,
'num_class': num_class,
'is_infer': is_infer,
'num_samples': num_samples
}
define_py_data_sources2(
"train.list" if not is_infer else None,
"test.list" if is_infer else None,
module="provider",
obj="process",
args=args)
settings(
batch_size=batch_size,
learning_rate=0.001 / batch_size,
learning_method=MomentumOptimizer(0.9),
regularization=L2Regularization(0.0005 * batch_size))
img = data_layer(name='image', size=height * width * 3)
def vgg_network(vgg_num=3):
tmp = img_conv_group(
input=img,
num_channels=3,
conv_padding=1,
conv_num_filter=[64, 64],
conv_filter_size=3,
conv_act=ReluActivation(),
pool_size=2,
pool_stride=2,
pool_type=MaxPooling())
tmp = img_conv_group(
input=tmp,
conv_num_filter=[128, 128],
conv_padding=1,
conv_filter_size=3,
conv_act=ReluActivation(),
pool_stride=2,
pool_type=MaxPooling(),
pool_size=2)
channels = []
for i in range(vgg_num):
channels.append(256)
tmp = img_conv_group(
input=tmp,
conv_num_filter=channels,
conv_padding=1,
conv_filter_size=3,
conv_act=ReluActivation(),
pool_stride=2,
pool_type=MaxPooling(),
pool_size=2)
channels = []
for i in range(vgg_num):
channels.append(512)
tmp = img_conv_group(
input=tmp,
conv_num_filter=channels,
conv_padding=1,
conv_filter_size=3,
conv_act=ReluActivation(),
pool_stride=2,
pool_type=MaxPooling(),
pool_size=2)
tmp = img_conv_group(
input=tmp,
conv_num_filter=channels,
conv_padding=1,
conv_filter_size=3,
conv_act=ReluActivation(),
pool_stride=2,
pool_type=MaxPooling(),
pool_size=2)
tmp = fc_layer(
input=tmp,
size=4096,
act=ReluActivation(),
layer_attr=ExtraAttr(drop_rate=0.5))
tmp = fc_layer(
input=tmp,
size=4096,
act=ReluActivation(),
layer_attr=ExtraAttr(drop_rate=0.5))
return fc_layer(input=tmp, size=num_class, act=SoftmaxActivation())
if layer_num == 16:
vgg = vgg_network(3)
elif layer_num == 19:
vgg = vgg_network(4)
else:
print("Wrong layer number.")
if is_infer:
outputs(vgg)
else:
lab = data_layer('label', num_class)
loss = cross_entropy(input=vgg, label=lab)
outputs(loss)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io, os
import random
import numpy as np
import six.moves.cPickle as pickle
from paddle.trainer.PyDataProvider2 import *
def remove_unk(x, n_words):
return [[1 if w >= n_words else w for w in sen] for sen in x]
# ==============================================================
# tensorflow uses fixed length, but PaddlePaddle can process
# variable-length. Padding is used in benchmark in order to
# compare with other platform.
# ==============================================================
def pad_sequences(sequences,
maxlen=None,
dtype='int32',
padding='post',
truncating='post',
value=0.):
lengths = [len(s) for s in sequences]
nb_samples = len(sequences)
if maxlen is None:
maxlen = np.max(lengths)
x = (np.ones((nb_samples, maxlen)) * value).astype(dtype)
for idx, s in enumerate(sequences):
if len(s) == 0:
continue # empty list was found
if truncating == 'pre':
trunc = s[-maxlen:]
elif truncating == 'post':
trunc = s[:maxlen]
else:
raise ValueError("Truncating type '%s' not understood" % padding)
if padding == 'post':
x[idx, :len(trunc)] = trunc
elif padding == 'pre':
x[idx, -len(trunc):] = trunc
else:
raise ValueError("Padding type '%s' not understood" % padding)
return x
def initHook(settings, vocab_size, pad_seq, maxlen, **kwargs):
settings.vocab_size = vocab_size
settings.pad_seq = pad_seq
settings.maxlen = maxlen
settings.input_types = [
integer_value_sequence(vocab_size), integer_value(2)
]
@provider(
init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM)
def process(settings, file):
f = open(file, 'rb')
train_set = pickle.load(f)
f.close()
x, y = train_set
# remove unk, namely remove the words out of dictionary
x = remove_unk(x, settings.vocab_size)
if settings.pad_seq:
x = pad_sequences(x, maxlen=settings.maxlen, value=0.)
for i in range(len(y)):
yield map(int, x[i]), int(y[i])
#!/usr/bin/env python
from paddle.trainer_config_helpers import *
import imdb
num_class = 2
vocab_size = 30000
fixedlen = 100
batch_size = get_config_arg('batch_size', int, 128)
lstm_num = get_config_arg('lstm_num', int, 1)
hidden_size = get_config_arg('hidden_size', int, 128)
# whether to pad sequence into fixed length
pad_seq = get_config_arg('pad_seq', bool, True)
imdb.create_data('imdb.pkl')
args = {'vocab_size': vocab_size, 'pad_seq': pad_seq, 'maxlen': fixedlen}
define_py_data_sources2(
"train.list", None, module="provider", obj="process", args=args)
settings(
batch_size=batch_size,
learning_rate=2e-3,
learning_method=AdamOptimizer(),
regularization=L2Regularization(8e-4),
gradient_clipping_threshold=25)
net = data_layer('data', size=vocab_size)
net = embedding_layer(input=net, size=128)
for i in xrange(lstm_num):
net = simple_lstm(input=net, size=hidden_size)
net = last_seq(input=net)
net = fc_layer(input=net, size=2, act=SoftmaxActivation())
lab = data_layer('label', num_class)
loss = classification_cost(input=net, label=lab)
outputs(loss)
#!/bin/bash
set -e
function train() {
cfg=$1
thread=$2
args="lstm_num=${3},seq_pad=${4},hidden_size=${5},batch_size=${6}"
paddle train --job=time \
--config=$cfg \
--use_gpu=1 \
--trainer_count=$thread \
--log_period=10 \
--test_period=100 \
--num_passes=1 \
--feed_data=1 \
--config_args=$args \
>logs/rnn-pad${4}-${thread}gpu-lstm${3}-batch${6}-hid${5}.log 2>&1
}
if [ ! -d "logs" ]; then
mkdir logs
fi
## padding, single gpu
#-----config--gpu--lstm_num--padding--hidden_size--batch_size
## lstm_num=2, batch_size=64
train rnn.py 1 2 1 256 64
train rnn.py 1 2 1 512 64
train rnn.py 1 2 1 1280 64
## lstm_num=2, batch_size=128
train rnn.py 1 2 1 256 128
train rnn.py 1 2 1 512 128
train rnn.py 1 2 1 1280 128
## lstm_num=4, batch_size=256
train rnn.py 1 2 1 256 256
train rnn.py 1 2 1 512 256
train rnn.py 1 2 1 1280 256
#==================multi gpus=====================#
# hidden_size=256, lstm_num=2, different batch size
train rnn.py 4 2 1 256 128
train rnn.py 4 2 1 256 256
train rnn.py 4 2 1 256 512
# hidden_size=512, lstm_num=4, different batch size
train rnn.py 4 2 1 512 128
train rnn.py 4 2 1 512 256
train rnn.py 4 2 1 512 512
......@@ -35,8 +35,6 @@ import os
import argparse
import time
import paddle.v2 as paddle
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--embedding_dim",
......
......@@ -21,7 +21,6 @@ import time
import numpy as np
import tensorflow as tf
import paddle.v2 as paddle
DTYPE = tf.float32
......
......@@ -27,7 +27,6 @@ import argparse
import time
import numpy as np
import paddle.v2 as paddle
import tensorflow as tf
DTYPE = tf.float32
......
......@@ -21,8 +21,6 @@ import argparse
import time
import tensorflow as tf
import paddle.v2 as paddle
def parse_args():
parser = argparse.ArgumentParser("LSTM model benchmark.")
......
......@@ -13,7 +13,6 @@
# limitations under the License.
"""VGG16 benchmark in TensorFlow"""
import tensorflow as tf
import paddle.v2 as paddle
import numpy as np
import argparse
import time
......
......@@ -20,31 +20,10 @@ if(WITH_DSO)
add_definitions(-DPADDLE_USE_DSO)
endif(WITH_DSO)
if(WITH_DOUBLE)
add_definitions(-DPADDLE_TYPE_DOUBLE)
endif(WITH_DOUBLE)
if(WITH_ARM_FP16)
add_definitions(-DPADDLE_ARM_FP16)
add_definitions("-march=armv8.2-a+fp16+simd")
endif(WITH_ARM_FP16)
if(WITH_TESTING)
add_definitions(-DPADDLE_WITH_TESTING)
endif(WITH_TESTING)
if(NOT WITH_TIMER)
add_definitions(-DPADDLE_DISABLE_TIMER)
endif(NOT WITH_TIMER)
if(USE_EIGEN_FOR_BLAS)
add_definitions(-DPADDLE_USE_EIGEN_FOR_BLAS)
endif(USE_EIGEN_FOR_BLAS)
if(EIGEN_USE_THREADS)
add_definitions(-DEIGEN_USE_THREADS)
endif(EIGEN_USE_THREADS)
if(NOT WITH_PROFILER)
add_definitions(-DPADDLE_DISABLE_PROFILER)
endif(NOT WITH_PROFILER)
......@@ -78,10 +57,6 @@ if(WIN32)
endif(NOT MSVC)
endif(WIN32)
if(NOT WITH_GOLANG)
add_definitions(-DPADDLE_WITHOUT_GOLANG)
endif(NOT WITH_GOLANG)
if(WITH_PSLIB)
add_definitions(-DPADDLE_WITH_PSLIB)
endif()
......@@ -152,7 +127,12 @@ endif()
if (WITH_MKLML AND MKLML_IOMP_LIB)
message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}")
set(OPENMP_FLAGS "-fopenmp")
if(WIN32)
# openmp not support well for now on windows
set(OPENMP_FLAGS "")
else(WIN32)
set(OPENMP_FLAGS "-fopenmp")
endif(WIN32)
set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}")
......@@ -166,55 +146,6 @@ if(WITH_DISTRIBUTE)
add_definitions(-DPADDLE_WITH_DISTRIBUTE)
endif()
if(WITH_GOLANG)
# we need to symlink Paddle directory into GOPATH. If we
# don't do it and we have code that depends on Paddle, go
# get ./... will download a new Paddle repo from Github,
# without the changes in our current Paddle repo that we
# want to build.
set(GOPATH "${CMAKE_CURRENT_BINARY_DIR}/go")
file(MAKE_DIRECTORY ${GOPATH})
set(PADDLE_IN_GOPATH "${GOPATH}/src/github.com/PaddlePaddle/Paddle")
file(MAKE_DIRECTORY "${PADDLE_IN_GOPATH}")
set(PADDLE_GO_PATH "${CMAKE_SOURCE_DIR}/go")
add_custom_target(go_path)
add_custom_command(TARGET go_path
# Symlink Paddle directory into GOPATH
COMMAND mkdir -p ${PADDLE_IN_GOPATH}
COMMAND rm -rf ${PADDLE_IN_GOPATH}
COMMAND ln -sf ${CMAKE_SOURCE_DIR} ${PADDLE_IN_GOPATH}
# Automatically get all dependencies specified in the source code
# We can't run `go get -d ./...` for every target, because
# multiple `go get` can not run concurrently, but make need to be
# able to run with multiple jobs.
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
)
if (GLIDE_INSTALL)
if(EXISTS $ENV{GOPATH}/bin/glide)
set(GLIDE "$ENV{GOPATH}/bin/glide")
else()
message(FATAL_ERROR "no glide executeble found: $ENV{GOPATH}/bin/glide")
endif()
# this command will only run when the file it depends is missing
# or has changed, or the output is missing.
add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/glide
COMMAND env GOPATH=${GOPATH} ${GLIDE} install
COMMAND touch ${CMAKE_BINARY_DIR}/glide
DEPENDS ${PADDLE_SOURCE_DIR}/go/glide.lock
WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go"
)
# depends on the custom command which outputs
# ${CMAKE_BINARY_DIR}/glide, the custom command does not need to
# run every time this target is built.
add_custom_target(go_vendor DEPENDS ${CMAKE_BINARY_DIR}/glide go_path)
endif()
endif(WITH_GOLANG)
if(WITH_GRPC)
add_definitions(-DPADDLE_WITH_GRPC)
endif(WITH_GRPC)
......
......@@ -168,10 +168,7 @@ elseif (${CUDA_VERSION} LESS 11.0) # CUDA 10.x
endif()
include_directories(${CUDA_INCLUDE_DIRS})
list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
if(NOT WITH_DSO)
# TODO(panyx0718): CUPTI only allows DSO?
list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUPTI_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
if(WIN32)
set_property(GLOBAL PROPERTY CUDA_MODULES ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY})
endif(WIN32)
......@@ -203,25 +200,26 @@ list(APPEND CUDA_NVCC_FLAGS "-w")
list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")
if (NOT WIN32)
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG})
elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE})
elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
# nvcc 9 does not support -Os. Use Release flags instead
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE})
endif()
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG})
elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE})
elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
# nvcc 9 does not support -Os. Use Release flags instead
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE})
endif()
else(NOT WIN32)
list(APPEND CUDA_NVCC_FLAGS "--compiler-options;/bigobj")
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
list(APPEND CUDA_NVCC_FLAGS "-g -G")
# match the cl's _ITERATOR_DEBUG_LEVEL
list(APPEND CUDA_NVCC_FLAGS "-D_DEBUG")
elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG")
else()
list(APPEND CUDA_NVCC_FLAGS "-Xcompiler \"/wd 4244 /wd 4267 /wd 4819\"")
list(APPEND CUDA_NVCC_FLAGS "--compiler-options;/bigobj")
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
list(APPEND CUDA_NVCC_FLAGS "-g -G")
# match the cl's _ITERATOR_DEBUG_LEVEL
list(APPEND CUDA_NVCC_FLAGS "-D_DEBUG")
elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG")
else()
message(FATAL "Windows only support Release or Debug build now. Please set visual studio build type to Release/Debug, x64 build.")
endif()
endif(NOT WIN32)
......
......@@ -74,5 +74,3 @@ add_dependencies(anakin_shared extern_anakin)
add_library(anakin_saber SHARED IMPORTED GLOBAL)
set_property(TARGET anakin_saber PROPERTY IMPORTED_LOCATION ${ANAKIN_SABER_LIB})
add_dependencies(anakin_saber extern_anakin)
list(APPEND external_project_dependencies anakin_shared anakin_saber)
INCLUDE(ExternalProject)
SET(ANY_SOURCE_DIR ${THIRD_PARTY_PATH}/any)
INCLUDE_DIRECTORIES(${ANY_SOURCE_DIR}/src/extern_lib_any)
ExternalProject_Add(
extern_lib_any
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/PaddlePaddle/any.git"
GIT_TAG "15595d8324be9e8a9a80d9ae442fdd12bd66df5d"
PREFIX ${ANY_SOURCE_DIR}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND ""
TEST_COMMAND ""
)
if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/lib_any_dummy.c)
file(WRITE ${dummyfile} "const char * dummy_any = \"${dummyfile}\";")
add_library(lib_any STATIC ${dummyfile})
else()
add_library(lib_any INTERFACE)
endif()
add_dependencies(lib_any extern_lib_any)
add_definitions(-DANY_IMPL_ANY_CAST_MOVEABLE)
LIST(APPEND external_project_dependencies lib_any)
......@@ -57,5 +57,4 @@ else()
endif()
add_dependencies(boost ${BOOST_PROJECT})
list(APPEND external_project_dependencies boost)
set(Boost_INCLUDE_DIR ${BOOST_INCLUDE_DIR})
......@@ -69,5 +69,3 @@ SET_PROPERTY(TARGET brpc PROPERTY IMPORTED_LOCATION ${BRPC_LIBRARIES})
ADD_DEPENDENCIES(brpc extern_brpc)
add_definitions(-DBRPC_WITH_GLOG)
LIST(APPEND external_project_dependencies brpc)
......@@ -31,5 +31,3 @@ else()
endif()
add_dependencies(cub extern_cub)
LIST(APPEND external_project_dependencies cub)
......@@ -27,5 +27,3 @@ else()
endif()
add_dependencies(dlpack extern_dlpack)
LIST(APPEND external_project_dependencies dlpack)
......@@ -52,5 +52,3 @@ else()
endif()
add_dependencies(eigen3 extern_eigen3)
LIST(APPEND external_project_dependencies eigen3)
......@@ -61,8 +61,6 @@ ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET gflags PROPERTY IMPORTED_LOCATION ${GFLAGS_LIBRARIES})
ADD_DEPENDENCIES(gflags extern_gflags)
LIST(APPEND external_project_dependencies gflags)
# On Windows (including MinGW), the Shlwapi library is used by gflags if available.
if (WIN32)
include(CheckIncludeFileCXX)
......
......@@ -20,8 +20,10 @@ SET(GLOG_INCLUDE_DIR "${GLOG_INSTALL_DIR}/include" CACHE PATH "glog include dire
IF(WIN32)
SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.lib" CACHE FILEPATH "glog library." FORCE)
SET(GLOG_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4267 /wd4530")
ELSE(WIN32)
SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.a" CACHE FILEPATH "glog library." FORCE)
SET(GLOG_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
ENDIF(WIN32)
INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR})
......@@ -39,7 +41,7 @@ ExternalProject_Add(
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS=${GLOG_CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
......@@ -70,5 +72,3 @@ ADD_LIBRARY(glog STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET glog PROPERTY IMPORTED_LOCATION ${GLOG_LIBRARIES})
ADD_DEPENDENCIES(glog extern_glog gflags)
LINK_LIBRARIES(glog gflags)
LIST(APPEND external_project_dependencies glog)
......@@ -79,5 +79,4 @@ IF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC))
SET_PROPERTY(TARGET gtest_main PROPERTY IMPORTED_LOCATION ${GTEST_MAIN_LIBRARIES})
ADD_DEPENDENCIES(gtest_main extern_gtest)
LIST(APPEND external_project_dependencies gtest gtest_main)
ENDIF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC))
......@@ -39,6 +39,3 @@ ADD_DEPENDENCIES(extern_leveldb snappy)
ADD_LIBRARY(leveldb STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET leveldb PROPERTY IMPORTED_LOCATION ${LEVELDB_LIBRARIES})
ADD_DEPENDENCIES(leveldb extern_leveldb)
LIST(APPEND external_project_dependencies leveldb)
......@@ -72,7 +72,4 @@ else()
add_library(libmct INTERFACE)
endif()
#ADD_LIBRARY(libmct SHARED IMPORTED GLOBAL)
ADD_DEPENDENCIES(libmct ${LIBMCT_PROJECT})
LIST(APPEND external_project_dependencies libmct)
......@@ -53,5 +53,3 @@ MESSAGE(STATUS "Libxsmm library: ${LIBXSMM_LIBS}")
include_directories(${LIBXSMM_INCLUDE_DIR})
ADD_DEFINITIONS(-DPADDLE_WITH_LIBXSMM)
ADD_DEPENDENCIES(libxsmm extern_libxsmm)
LIST(APPEND external_project_dependencies libxsmm)
......@@ -31,9 +31,17 @@ IF(APPLE)
return()
ENDIF()
MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/lib to runtime path")
# Introduce variables:
# * CMAKE_INSTALL_LIBDIR
INCLUDE(GNUInstallDirs)
SET(LIBDIR "lib")
if(CMAKE_INSTALL_LIBDIR MATCHES ".*lib64$")
SET(LIBDIR "lib64")
endif()
MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/l${LIBDIR} to runtime path")
SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib")
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/${LIBDIR}")
INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR}) # For MKLDNN code to include internal headers.
......@@ -49,6 +57,8 @@ IF(NOT WIN32)
SET(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value")
SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}")
SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}")
ELSE()
SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} /EHsc")
ENDIF(NOT WIN32)
ExternalProject_Add(
......@@ -56,12 +66,11 @@ ExternalProject_Add(
${EXTERNAL_PROJECT_LOG_ARGS}
DEPENDS ${MKLDNN_DEPENDS}
GIT_REPOSITORY "https://github.com/intel/mkl-dnn.git"
GIT_TAG "830a10059a018cd2634d94195140cf2d8790a75a"
GIT_TAG "863ff6e7042cec7d2e29897fe9f0872e0888b0fc"
PREFIX ${MKLDNN_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
CMAKE_ARGS -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
CMAKE_ARGS -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
......@@ -78,9 +87,9 @@ ExternalProject_Add(
-DMKLROOT:PATH=${MKLML_ROOT}
)
if(WIN32)
SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/lib/mkldnn.lib" CACHE FILEPATH "mkldnn library." FORCE)
SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/${LIBDIR}/mkldnn.lib" CACHE FILEPATH "mkldnn library." FORCE)
else(WIN32)
SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/lib/libmkldnn.so" CACHE FILEPATH "mkldnn library." FORCE)
SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/${LIBDIR}/libmkldnn.so" CACHE FILEPATH "mkldnn library." FORCE)
endif(WIN32)
ADD_LIBRARY(shared_mkldnn SHARED IMPORTED GLOBAL)
......@@ -88,7 +97,6 @@ SET_PROPERTY(TARGET shared_mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB})
ADD_DEPENDENCIES(shared_mkldnn ${MKLDNN_PROJECT})
MESSAGE(STATUS "MKLDNN library: ${MKLDNN_LIB}")
add_definitions(-DPADDLE_WITH_MKLDNN)
LIST(APPEND external_project_dependencies shared_mkldnn)
# generate a static dummy target to track mkldnn dependencies
# for cc_library(xxx SRCS xxx.c DEPS mkldnn)
......@@ -101,7 +109,7 @@ ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT})
# copy the real so.0 lib to install dir
# it can be directly contained in wheel or capi
if(WIN32)
SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/lib/mkldnn.dll)
SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/bin/mkldnn.dll)
else(WIN32)
SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0)
ADD_CUSTOM_COMMAND(OUTPUT ${MKLDNN_SHARED_LIB}
......
......@@ -39,8 +39,10 @@ IF(WIN32)
SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.lib)
SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/mklml.dll)
SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.dll)
ELSE()
SET(MKLML_VER "mklml_lnx_${TIME_VERSION}" CACHE STRING "" FORCE)
ELSE()
#TODO(intel-huying):
# Now enable Erf function in mklml library temporarily, it will be updated as offical version later.
SET(MKLML_VER "Glibc225_vsErf_mklml_lnx_${TIME_VERSION}" CACHE STRING "" FORCE)
SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so)
SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so)
......@@ -73,4 +75,3 @@ INCLUDE_DIRECTORIES(${MKLML_INC_DIR})
ADD_LIBRARY(mklml SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET mklml PROPERTY IMPORTED_LOCATION ${MKLML_LIB})
ADD_DEPENDENCIES(mklml ${MKLML_PROJECT})
LIST(APPEND external_project_dependencies mklml)
......@@ -37,7 +37,7 @@ INCLUDE(GNUInstallDirs)
INCLUDE(ExternalProject)
SET(NGRAPH_PROJECT "extern_ngraph")
SET(NGRAPH_GIT_TAG "20bd8bbc79ae3a81c57313846a2be7313e5d1dab")
SET(NGRAPH_GIT_TAG "a444f7a959b7d87f2c117c9b57a4c387759e481e")
SET(NGRAPH_SOURCES_DIR ${THIRD_PARTY_PATH}/ngraph)
SET(NGRAPH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/ngraph)
SET(NGRAPH_INC_DIR ${NGRAPH_INSTALL_DIR}/include)
......@@ -69,7 +69,7 @@ ExternalProject_Add(
CMAKE_ARGS -DNGRAPH_DEX_ONLY=TRUE
CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
CMAKE_ARGS -DMKLDNN_INCLUDE_DIR=${MKLDNN_INC_DIR}
CMAKE_ARGS -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/lib
CMAKE_ARGS -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}
CMAKE_ARGS -DMKLML_LIB_DIR=${MKLML_INSTALL_DIR}/lib
)
......@@ -77,4 +77,3 @@ add_dependencies(ngraph ${NGRAPH_PROJECT})
target_compile_definitions(ngraph INTERFACE -DPADDLE_WITH_NGRAPH)
target_include_directories(ngraph INTERFACE ${NGRAPH_INC_DIR})
target_link_libraries(ngraph INTERFACE ${NGRAPH_SHARED_LIB})
LIST(APPEND external_project_dependencies ngraph)
......@@ -11,11 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
IF(USE_EIGEN_FOR_BLAS)
return()
ENDIF(USE_EIGEN_FOR_BLAS)
INCLUDE(cblas)
IF(NOT ${CBLAS_FOUND})
......@@ -91,7 +86,6 @@ ENDIF()
IF(NOT ${CBLAS_FOUND})
ADD_DEPENDENCIES(cblas extern_openblas)
LIST(APPEND external_project_dependencies cblas)
ELSE()
IF("${CBLAS_PROVIDER}" STREQUAL "MKLML")
ADD_DEPENDENCIES(cblas mklml)
......
......@@ -129,7 +129,6 @@ macro(PROMPT_PROTOBUF_LIB)
ADD_DEPENDENCIES(protoc ${dep})
ENDFOREACH()
LIST(APPEND external_project_dependencies protobuf)
RETURN()
endmacro()
macro(SET_PROTOBUF_VERSION)
......@@ -231,7 +230,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
)
ENDFUNCTION()
SET(PROTOBUF_VERSION 3.1)
SET(PROTOBUF_VERSION 3.1.0)
IF(NOT PROTOBUF_FOUND)
build_protobuf(extern_protobuf FALSE)
......
......@@ -70,4 +70,3 @@ ExternalProject_Add(
ADD_LIBRARY(pslib SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET pslib PROPERTY IMPORTED_LOCATION ${PSLIB_LIB})
ADD_DEPENDENCIES(pslib ${PSLIB_PROJECT})
LIST(APPEND external_project_dependencies pslib)
......@@ -70,4 +70,3 @@ ExternalProject_Add(
ADD_LIBRARY(pslib_brpc SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET pslib_brpc PROPERTY IMPORTED_LOCATION ${PSLIB_BRPC_LIB})
ADD_DEPENDENCIES(pslib_brpc ${PSLIB_BRPC_PROJECT})
LIST(APPEND external_project_dependencies pslib_brpc)
......@@ -20,6 +20,12 @@ set(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy)
set(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy)
set(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include" CACHE PATH "snappy include directory." FORCE)
if(WIN32)
SET(SNAPPY_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4244 /wd4267")
else()
SET(SNAPPY_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
endif()
ExternalProject_Add(
extern_snappy
GIT_REPOSITORY "https://github.com/google/snappy"
......@@ -31,7 +37,7 @@ ExternalProject_Add(
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS=${SNAPPY_CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR}
......
......@@ -26,5 +26,3 @@ else()
endif()
add_dependencies(simple_threadpool extern_threadpool)
LIST(APPEND external_project_dependencies simple_threadpool)
......@@ -83,5 +83,3 @@ INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include wa
ADD_LIBRARY(warpctc SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET warpctc PROPERTY IMPORTED_LOCATION ${WARPCTC_LIBRARIES})
ADD_DEPENDENCIES(warpctc extern_warpctc)
LIST(APPEND external_project_dependencies warpctc)
......@@ -55,4 +55,3 @@ else()
endif()
add_dependencies(xbyak ${XBYAK_PROJECT})
list(APPEND external_project_dependencies xbyak)
......@@ -71,5 +71,3 @@ add_library(xxhash STATIC IMPORTED GLOBAL)
set_property(TARGET xxhash PROPERTY IMPORTED_LOCATION ${XXHASH_LIBRARIES})
include_directories(${XXHASH_INCLUDE_DIR})
add_dependencies(xxhash extern_xxhash)
LIST(APPEND external_project_dependencies xxhash)
......@@ -57,5 +57,3 @@ ENDIF(WIN32)
ADD_LIBRARY(zlib STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET zlib PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES})
ADD_DEPENDENCIES(zlib extern_zlib)
LIST(APPEND external_project_dependencies zlib)
......@@ -21,7 +21,7 @@ function(CheckCompilerCXX11Flag)
if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.3)
message(FATAL_ERROR "Unsupported Clang version. Clang >= 3.3 required.")
endif()
endif()
endif()
endif()
endfunction()
......@@ -147,12 +147,7 @@ set(GPU_COMMON_FLAGS
-Wno-error=unused-function # Warnings in Numpy Header.
-Wno-error=array-bounds # Warnings in Eigen::array
)
else(NOT WIN32)
set(COMMON_FLAGS
"/w") #disable all warnings.
set(GPU_COMMON_FLAGS
"/w") #disable all warnings
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64")
endif(NOT WIN32)
if (APPLE)
......@@ -193,8 +188,7 @@ safe_set_static_flag()
CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
if(${flag_var} MATCHES "/W3")
string(REGEX REPLACE "/W3" "/w" ${flag_var} "${${flag_var}}")
endif(${flag_var} MATCHES "/W3")
string(REGEX REPLACE "(^| )/W[0-9]( |$)" " " ${flag_var} "${${flag_var}}")
set(flag_var "${flag_var} /w")
endforeach(flag_var)
endif(WIN32)
......@@ -11,8 +11,6 @@ include_directories("/opt/rocm/rocrand/include")
include_directories("/opt/rocm/rccl/include")
include_directories("/opt/rocm/thrust")
list(APPEND EXTERNAL_LIBS "-L/opt/rocm/lib/ -lhip_hcc")
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -std=c++11" )
if(WITH_DSO)
......@@ -31,22 +29,12 @@ if(WITH_GRPC)
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_GRPC")
endif(WITH_GRPC)
if(NOT WITH_GOLANG)
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITHOUT_GOLANG")
endif(NOT WITH_GOLANG)
if(WITH_MKLDNN)
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_MKLDNN")
endif(WITH_MKLDNN)
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DANY_IMPL_ANY_CAST_MOVEABLE")
if(NOT WITH_RDMA)
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_DISABLE_RDMA")
endif(NOT WITH_RDMA)
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG})
elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
......
......@@ -52,8 +52,8 @@ function(op_library TARGET)
endif()
if(WITH_MKLDNN)
string(REPLACE "_op" "_mkldnn_op" MKLDNN_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MKLDNN_FILE}.cc)
list(APPEND mkldnn_cc_srcs ${MKLDNN_FILE}.cc)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/mkldnn/${MKLDNN_FILE}.cc)
list(APPEND mkldnn_cc_srcs mkldnn/${MKLDNN_FILE}.cc)
endif()
endif()
else()
......@@ -153,7 +153,11 @@ function(op_library TARGET)
# pybind USE_OP_DEVICE_KERNEL for CUDNN
list(LENGTH cudnn_cu_cc_srcs cudnn_cu_cc_srcs_len)
if (WITH_GPU AND ${cudnn_cu_cc_srcs_len} GREATER 0)
if(${TARGET} STREQUAL "activation")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, CUDNN);\n")
else()
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n")
endif()
endif()
# pybind USE_OP_DEVICE_KERNEL for MIOPEN
......@@ -168,6 +172,9 @@ function(op_library TARGET)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, MKLDNN);\n")
elseif(${MKLDNN_FILE} STREQUAL "conv_mkldnn_op")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, FP32);\n")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, S8);\n")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, U8);\n")
else()
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n")
endif()
......
# user should download rdma first from subversion repository
# execute following instruction to download svn mannally
# svn co https://svn.baidu.com/sys/ip/trunk/rdma/sockrdmav1 rdma/
# svn co https://svn.baidu.com/sys/ip/trunk/rdma/thirdparty rdma/
# we use static output in svn repositories to avoid implict bugs from not standard runtime env.
if(WITH_RDMA)
set(RDMA_ROOT $ENV{RDMA_ROOT} CACHE PATH "Folder contains RDMA sock library and thirdparty library")
function(generate_rdma_links)
#redirect to current DIR to isolate the pollution from system runtime environment
#it can benifits unified control for different gcc environment.
#e.g, by default gcc48 did not refer /usr/lib64 which could contain low version
#runtime libraries that will crash process while loading it. That redirect trick
#can fix it.
execute_process(
COMMAND mkdir -p librdma
COMMAND ln -s -f /usr/lib64/libibverbs.so.1.0.0 librdma/libibverbs.so.1
COMMAND ln -s -f /usr/lib64/libibverbs.so.1.0.0 librdma/libibverbs.so
COMMAND ln -s -f /usr/lib64/librdmacm.so.1.0.0 librdma/librdmacm.so.1
COMMAND ln -s -f /usr/lib64/librdmacm.so.1.0.0 librdma/librdmacm.so
COMMAND ln -s -f /lib64/libnl.so.1.1.4 librdma/libnl.so.1
COMMAND ln -s -f /lib64/libnl.so.1.1.4 librdma/libnl.so
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
)
endfunction(generate_rdma_links)
#check and set headers
find_path(RDMA_INC_SXISOCK sxi_sock.h PATHS ${RDMA_ROOT}/sockrdmav1/output/include)
find_path(RDMA_INC_XIO libxio.h PATHS ${RDMA_ROOT}/thirdparty/output/accelio)
find_path(RDMA_INC_EVENT event2 PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
find_path(RDMA_INC_NUMA numa.h PATHS ${RDMA_ROOT}/thirdparty/output/libnuma)
#check and set libs
find_library(RDMA_LIB_SXISOCK NAMES sxisock PATHS ${RDMA_ROOT}/sockrdmav1/output)
find_library(RDMA_LIB_XIO NAMES xio PATHS ${RDMA_ROOT}/thirdparty/output/accelio)
find_library(RDMA_LIB_EVENT NAMES event PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
find_library(RDMA_LIB_EVENT_CORE NAMES event_core PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
find_library(RDMA_LIB_EVENT_EXTRA NAMES event_extra PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
find_library(RDMA_LIB_EVENT_PTHREADS NAMES event_pthreads PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
find_library(RDMA_LIB_NUMA NAMES numa PATHS ${RDMA_ROOT}/thirdparty/output/libnuma)
if(
RDMA_INC_SXISOCK AND
RDMA_INC_XIO AND
RDMA_INC_EVENT AND
RDMA_INC_NUMA AND
RDMA_LIB_SXISOCK AND
RDMA_LIB_XIO AND
RDMA_LIB_EVENT AND
RDMA_LIB_EVENT_CORE AND
RDMA_LIB_EVENT_EXTRA AND
RDMA_LIB_EVENT_PTHREADS AND
RDMA_LIB_NUMA
)
set(RDMA_INC_DIR
${RDMA_INC_SXISOCK}
${RDMA_INC_XIO}
${RDMA_INC_EVENT}
${RDMA_INC_NUMA})
set(RDMA_LIBS
${RDMA_LIB_SXISOCK}
${RDMA_LIB_XIO}
${RDMA_LIB_EVENT}
${RDMA_LIB_EVENT_CORE}
${RDMA_LIB_EVENT_EXTRA}
${RDMA_LIB_EVENT_PTHREADS}
${RDMA_LIB_NUMA}
)
set(RDMA_LD_FLAGS "-L./librdma -libverbs -lrdmacm -Xlinker -rpath ./librdma")
include_directories("${RDMA_INC_DIR}")
else()
#if this module is not called, RDMA_INC_DIR RDMA_LIBS will be null, so top module always refer this variable
message(FATAL_ERROR, "RDMA libraries are not found, try to set RDMA_ROOT or check all related libraries.")
endif()
else(WITH_RDMA)
set(RDMA_LIBS "")
set(RDMA_LD_FLAGS "")
add_definitions(-DPADDLE_DISABLE_RDMA)
endif(WITH_RDMA)
......@@ -33,6 +33,5 @@ if(TENSORRT_FOUND)
message(STATUS "Current TensorRT header is ${TENSORRT_INCLUDE_DIR}/NvInfer.h. "
"Current TensorRT version is v${TENSORRT_MAJOR_VERSION}. ")
include_directories(${TENSORRT_INCLUDE_DIR})
list(APPEND EXTERNAL_LIBS ${TENSORRT_LIBRARY})
add_definitions(-DPADDLE_WITH_TENSORRT)
endif()
......@@ -31,8 +31,23 @@ while ("${PADDLE_VERSION}" STREQUAL "")
set(tmp_version "${GIT_TAG_NAME}~1")
endif()
else()
# otherwise, we always set PADDLE_VERSION to 0.0.0 to represent latest
set(PADDLE_VERSION "0.0.0")
execute_process(
COMMAND ${GIT_EXECUTABLE} describe --exact-match --tags ${tmp_version}
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
OUTPUT_VARIABLE GIT_EXACT_TAG_NAME
RESULT_VARIABLE GIT_EXACT_TAG_RESULT
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
if (NOT ${GIT_EXACT_TAG_NAME})
# Check if current branch is tag branch
if (${GIT_EXACT_TAG_NAME} MATCHES "v${TAG_VERSION_REGEX}")
string(REPLACE "v" "" PADDLE_VERSION ${GIT_EXACT_TAG_NAME})
else()
set(PADDLE_VERSION "0.0.0")
endif()
else()
# otherwise, we always set PADDLE_VERSION to 0.0.0 to represent latest
set(PADDLE_VERSION "0.0.0")
endif()
endif()
else()
set(PADDLE_VERSION "0.0.0")
......
......@@ -5,13 +5,13 @@ Kexin Zhao <zhaokexin01@baidu.com>
## Introduction
Deep learning is usually a two-stage work: training and inference. The training stage estimates model parameters (weights) from data. The inference stage loads the weights and uses them to interpret inputs. Typically, weights are 32-bit float values (float32). Some new devices, including NVIDIA Volta GPUs, support higher speed computation using 16-bit float values (float16).
This article explains our efforts with PaddlePaddle to train using float32 and to inference using float16. We describe a [*transpiler*](https://github.com/PaddlePaddle/Paddle/blob/a4d3de0071e1f3912230c3ab3f9ac74cf06b093a/doc/fluid/design/motivation/fluid_compiler.md), which converts a PaddlePaddle Fluid model, which, to be precise, should be called a [Fluid *program*](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/program.md), into the inference program, and converts the weights from float32 into float16.
This article explains our efforts with PaddlePaddle to train using float32 and to inference using float16. We describe a [*transpiler*](https://github.com/PaddlePaddle/Paddle/blob/a4d3de0071e1f3912230c3ab3f9ac74cf06b093a/doc/fluid/design/motivation/fluid_compiler.md), which converts a PaddlePaddle Fluid model, which, to be precise, should be called a [Fluid *program*](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/concepts/program.md), into the inference program, and converts the weights from float32 into float16.
## What is float16?
float16 (or FP16) is a half-precision floating-point format that uses 16 bits in memory to represent a value. The advantage over 32-bit single-precision floating-point format (commonly known as float or float32 data type) is that it requires half the storage and bandwidth at the expense of precision and range. Fortunately, DNN inference has a high tolerance for the loss of precision and range when using float16 to represent the weights, and the inference accuracy will only be minimally affected in most cases, which gives us the opportunity to use float16 data type to speed up the inference.
Interested readers can refer to our [design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/data_type/float16.md) and [code](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/float16.h) for more details on how we implement the float16 data type.
Interested readers can refer to our [design doc](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/data_type/float16.md) and [code](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/float16.h) for more details on how we implement the float16 data type.
## Why float16?
The trend in today's deep learning community is to use bigger and deeper model, which translates to larger memory footprint, higher computation demands, and as a result higher energy consumption on computing devices. The advantages of float16 over float32 are correspondingly three-fold:
......@@ -24,12 +24,12 @@ The trend in today's deep learning community is to use bigger and deeper model,
## Fluid implementation of float16 inference
### Overview
Fluid use [Program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#program) instead of computation graph to describe a neural network model and the optimization procedure. Fluid program is a python wrapper around a protobuf message called [ProgramDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/program.md). Similar to programming languages, the basic structure of a Fluid program is some nested [blocks](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#block), where each block consists of some [variable](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#variable) definitions and a sequence of [operators](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#operator). An [executor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/executor.md) will run a given program by sequentially executing the operators in the entrance block.
Fluid use [Program](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/modules/python_api.md#program) instead of computation graph to describe a neural network model and the optimization procedure. Fluid program is a python wrapper around a protobuf message called [ProgramDesc](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/concepts/program.md). Similar to programming languages, the basic structure of a Fluid program is some nested [blocks](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/modules/python_api.md#block), where each block consists of some [variable](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/modules/python_api.md#variable) definitions and a sequence of [operators](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/modules/python_api.md#operator). An [executor](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/concepts/executor.md) will run a given program by sequentially executing the operators in the entrance block.
### Basic requirement
When an executor runs an operator, it uses a kernel to perform computations on tensors contained in the input variables, and then writes the results to the tensors in the output variables. Each operator has multiple kernels for different combinations of data types, devices, and library types, respectively. The operator will select the appropriate kernel to run based on, among other things, the data type of the input tensors. By default, every Fluid operator has a kernel for float data type that takes float inputs and generates float outputs.
If we provide float input to the first operator in a program, then each operator will use float kernel to compute float output and send it as input to the next operator to trigger its float kernel. This chain effect will make the program run in float mode and gives us a final output of float data type.
If we provide float input to the first operator in a program, then each operator will use float kernel to compute float output and send it as input to the next operator to trigger its float kernel. This chain effect will make the program run in float mode and gives us a final output of float data type.
The same principle applies if we want a program to run in float16 mode. We provide input variable of the float16 data type to the first operator, and every subsequent operator will invoke the float16 kernel until we get the final output in float16. So the preliminary requirements for float16 inference are to add float16 kernels to operators that are needed in a specific kind of neural networks. Our current focus is on Convolutional Neural Networks (CNN) and hence we have added float16 kernels to the following operators: convolution, pooling, GEMM, elementwise addition, batch norm, dropout, various activations including relu and tanh, and softmax.
......@@ -75,7 +75,7 @@ In this scenario, we already have a float32 inference program and some associate
We can then run various inference experiments in float16 mode and save the float16 program and weights on disk for future deployment. To enhance the code usability, we maintain a consistent API so that user can use the same float32 input data to run inference program in either float32 and float16 mode and obtain output data both of float32 data type. Consequently, we need to add cast operators in the float16 inference program for conversions between the float16 tensor and float32 tensor.
The float16 transpiler is implemented to fulfill the requirements mentioned above. The details of the float16 transpiler can be found [here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/data_type/float16.md#float16-inference).
The float16 transpiler is implemented to fulfill the requirements mentioned above. The details of the float16 transpiler can be found [here](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/data_type/float16.md#float16-inference).
### Experiment results
Simply running the following commands to reproduce the experiment results presented in this section:
......@@ -113,7 +113,7 @@ We repeat the test ten times and get the following results:
| #10 | 62.53% | 62.48% |
| average| 62.63% | 62.62% |
We can see that the accuracy of float16 inference is very close to that of float32 inference in every experiment (within 0.05% difference) and is overall 0.01% better than its float32 counterpart averaged over ten tests.
We can see that the accuracy of float16 inference is very close to that of float32 inference in every experiment (within 0.05% difference) and is overall 0.01% better than its float32 counterpart averaged over ten tests.
#### Performance benchmark
Currently, Fluid only supports float16 inference on NVIDIA GPUs. There is no motivation to support float16 inference on non-ARM CPUs where float16 is not natively supported, and float16 calculation will only be slower than its float32 counterpart.
......@@ -132,7 +132,7 @@ Average inference time for one mini-batch on Vgg16 model tested on ImageNet data
|float16| 3.32 | 4.11 | 5.88 | 9.41 | 16.54 | 30.47 | 60.23 |
|Speedup| 4.22 | 2.36  | 3.91 | 3.00 | 3.26  | 2.77 | 2.97 |
We can see that float16 inference provides **2x ~ 4x** speedup on different batch sizes.
We can see that float16 inference provides **2x ~ 4x** speedup on different batch sizes.
Convolution operation is ususally the computational bottleneck of CNN, so we also check the average time spent on the Fluid convolution operators for one mini-batch as follows:
......@@ -162,7 +162,7 @@ We find that the speedup provided by float16 inference starts relatively small a
We also did the same benchmark on a single NVIDIA GeForce GTX 1080 Ti GPU that does not support Tensor Core. The results show that for Vgg16, float16 inference provides consistent small speedup (around 1.15x) for all mini-batch sizes, while for Resnet50, float16 inference is slower than its float32 counterpart in small batch sizes (mb = 1 and 2) and then delivers around 1.15x speedup for all larger batch sizes. By comparing the benchmarks on 1080 Ti and V100, we find that Tensor Core, which is specialized for float16 computations, is a critical component of high performance float16 inference.
Please refer to [here](https://github.com/PaddlePaddle/Paddle/blob/develop/contrib/float16/float16_benchmark.md) for complete benchmark results.
Please refer to [here](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/contrib/float16/float16_benchmark.md) for complete benchmark results.
### Summary
1. Fluid is now able to run inference in float16 mode via a float16 transpiler. We currently support CNN programs, including Vgg and Resnet, to run in float16 inference mode.
......
......@@ -14,9 +14,7 @@ cmake .. -DWITH_AVX=OFF \
-DWITH_MKL=OFF \
-DWITH_GPU=ON \
-DWITH_TESTING=ON \
-DWITH_TIMER=ON \
-DWITH_PROFILER=ON \
-DWITH_FLUID_ONLY=ON
make -j `nproc`
pip install -U "$WHEEL_PATH/$(ls $WHEEL_PATH)"
......
此差异已折叠。
......@@ -38,10 +38,10 @@ if(WITH_GPU)
nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context)
add_dependencies(tensor tensor_util)
else()
nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context )
nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context profiler)
endif(WIN32)
else()
cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context )
cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context profiler)
endif()
cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
......@@ -128,7 +128,7 @@ cc_test(version_test SRCS version_test.cc DEPS version)
cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version)
cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc memory_optimize_helper)
nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
py_proto_compile(framework_py_proto SRCS framework.proto data_feed.proto)
......@@ -158,18 +158,19 @@ cc_library(variable_helper SRCS variable_helper.cc DEPS lod_tensor)
cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
if(WITH_DISTRIBUTE)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog
lod_rank_table feed_fetch_method sendrecvop_rpc ${GLOB_DISTRIBUTE_DEPS} graph_to_program_pass variable_helper)
if(WITH_NGRAPH)
set(NGRAPH_EXE_DEPS ngraph_engine)
else()
set(NGRAPH_EXE_DEPS)
endif()
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
if(WITH_DISTRIBUTE)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog
lod_rank_table feed_fetch_method sendrecvop_rpc ${GLOB_DISTRIBUTE_DEPS} graph_to_program_pass variable_helper ${NGRAPH_EXE_DEPS})
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
else()
if (WITH_NGRAPH)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper ngraph_engine)
else ()
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
endif()
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper ${NGRAPH_EXE_DEPS})
cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
endif()
......@@ -192,6 +193,7 @@ cc_library(prune SRCS prune.cc DEPS framework_proto)
cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
proto_desc)
cc_test(inplace_op_inference_test SRCS inplace_op_inference_test.cc DEPS op_registry proto_desc op_info memory_optimize_helper)
cc_library(selected_rows SRCS selected_rows.cc DEPS tensor)
cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)
......
......@@ -244,6 +244,7 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program,
auto& block = main_program.Block(0);
for (auto var_name : fetch_var_names) {
auto var_desc = block.FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(var_desc, "%s is not found.", var_name);
auto shapes = var_desc->GetShape();
PADDLE_ENFORCE(shapes[shapes.size() - 1] == 1,
"var %s: Fetched var has wrong shape, "
......
......@@ -13,7 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/block_desc.h"
#include <queue>
#include <unordered_set>
#include <utility>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
......@@ -155,6 +159,16 @@ void BlockDesc::RemoveOp(size_t s, size_t e) {
ops_.erase(ops_.begin() + s, ops_.begin() + e);
}
void BlockDesc::RemoveOpInternal(const OpDesc *op_desc) {
// TODO(minqiyang): make this faster
for (auto it = ops_.begin(); it != ops_.end(); ++it) {
if (it->get() == op_desc) {
ops_.erase(it);
break;
}
}
}
std::vector<OpDesc *> BlockDesc::AllOps() const {
std::vector<OpDesc *> res;
for (const auto &op : ops_) {
......
......@@ -93,6 +93,8 @@ class BlockDesc {
*/
void RemoveOp(size_t s, size_t e);
void RemoveOpInternal(const OpDesc *op_desc);
void RemoveVar(const std::string &name) { vars_.erase(name); }
std::vector<OpDesc *> AllOps() const;
......
......@@ -134,11 +134,6 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
out_layout =
out_layout == DataLayout::kAnyLayout ? DataLayout::kNCHW : out_layout;
auto& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = dynamic_cast<platform::MKLDNNDeviceContext*>(
pool.Get(expected_kernel_type.place_));
auto& cpu_engine = dev_ctx->GetEngine();
std::vector<int> in_tz = paddle::framework::vectorize2int(in.dims());
std::vector<int> out_tz = in_tz;
......@@ -147,29 +142,25 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
"Input tensor type is not supported: %s", in.type());
memory::data_type out_type = in_type;
auto in_format = platform::MKLDNNFormatForSize(in_tz.size(), in.format());
auto out_format =
platform::MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout));
// output tensor has the same dims as input. Reorder don't change dims
out->Resize(in.dims());
if (in_format != out_format) {
// tempory mem pd fr out , to make reorder
auto out_mem_pd = paddle::platform::create_prim_desc_from_dims(
paddle::framework::vectorize2int(out->dims()),
mkldnn::memory::format::blocked, out_type);
if (in.get_mkldnn_prim_desc() != out_mem_pd) {
void* in_data = GetDataFromTensor(in, in_type);
auto out_data = out->mutable_data(expected_kernel_type.place_, in.type());
auto in_memory =
memory({{{in_tz}, in_type, in_format}, cpu_engine}, in_data);
auto out_memory =
memory({{{out_tz}, out_type, out_format}, cpu_engine}, out_data);
auto in_memory = memory(in.get_mkldnn_prim_desc(), in_data);
auto out_memory = memory(out_mem_pd, out_data);
platform::Reorder(in_memory, out_memory);
} else {
out->ShareDataWith(in);
}
out->set_layout(out_layout);
// reset format since the out tensor will be feed to non-MKLDNN OPkernel
out->set_format(memory::format::format_undef);
#endif
}
......
......@@ -51,13 +51,31 @@ void TransformData(const OpKernelType &expected_kernel_type,
#ifdef PADDLE_WITH_MKLDNN
// Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel
// Just set layout/format. No real transform occur
auto out_format = platform::MKLDNNFormatForSize(in.dims().size(),
ToMKLDNNFormat(lin));
out.ShareDataWith(input_tensor);
out.set_layout(DataLayout::kMKLDNN);
out.set_format(out_format);
// TODO(jczaja): Remove that once all mkldnn ops
// are modified to work with mkldnn_blocked
auto mkldnn_fmt = [&](int rank) {
switch (rank) {
case 5:
return mkldnn::memory::format::ncdhw;
case 4:
return mkldnn::memory::format::nchw;
case 3:
return mkldnn::memory::format::ncw;
case 2:
return mkldnn::memory::format::nc;
case 1:
return mkldnn::memory::format::x;
default:
return mkldnn::memory::format::blocked;
}
};
auto out_mem_pd = paddle::platform::create_prim_desc_from_dims(
paddle::framework::vectorize2int(out.dims()),
mkldnn_fmt(out.dims().size()));
out.set_mkldnn_prim_desc(out_mem_pd);
#endif
} else {
// Case2 - transfrom from MKLDNN OPKernel to Non-MKLDNN OPKernel
......
......@@ -50,10 +50,15 @@ cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_
cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope)
cc_library(memory_optimize_pass SRCS analysis_var_pass.cc memory_reuse_types.cc DEPS graph graph_helper pass)
if(WITH_GPU)
cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper gpu_info)
else()
cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper cpu_info)
endif()
cc_library(memory_optimize_pass SRCS memory_optimize_pass.cc DEPS memory_optimize_helper pass)
cc_library(inplace_op_pass SRCS inplace_op_pass.cc DEPS memory_optimize_pass op_info)
cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper)
cc_library(memory_early_delete_pass SRCS memory_early_delete_pass.cc DEPS memory_optimize_pass computation_op_handle scale_loss_grad_op_handle rpc_op_handle
all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle graph graph_helper pass)
cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle)
cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper)
cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass)
......@@ -65,13 +70,11 @@ cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_he
cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle)
set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass memory_early_delete_pass)
set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass inplace_op_pass)
if (WITH_GPU)
list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass)
endif()
cc_test(memory_reuse_types_test SRCS memory_reuse_types_test.cc memory_reuse_types.cc DEPS framework_proto graph)
cc_test(analysis_var_pass_test SRCS analysis_var_pass_test.cc analysis_var_pass.cc memory_reuse_types.cc DEPS framework_proto graph graph_helper op_registry pass)
cc_test(memory_optimize_helper_test SRCS memory_optimize_helper_test.cc memory_optimize_helper.cc DEPS framework_proto graph graph_helper op_registry)
cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS})
cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
......
......@@ -30,8 +30,6 @@ namespace paddle {
namespace framework {
namespace details {
static constexpr char kAllOpDescs[] = "all_op_descs";
VarHandle* GetValidInput(const OpHandleBase* a) {
for (auto p : a->Inputs()) {
VarHandle* b = dynamic_cast<VarHandle*>(p);
......@@ -52,7 +50,7 @@ std::unique_ptr<ir::Graph> AllReduceDepsPass::ApplyImpl(
std::unordered_map<std::string, int> vars;
// TODO(gongwb): use graph topology sort to find the order of operators.
// Note that must assert topology sort is stable
auto& ops = Get<const std::vector<OpDesc*>>(kAllOpDescs);
auto& ops = graph->Get<const std::vector<OpDesc*>>(kStaleProgramOpDescs);
for (auto* op_desc : ops) {
auto outputs = op_desc->Outputs();
for (auto& o_it : outputs) {
......@@ -122,4 +120,4 @@ std::unique_ptr<ir::Graph> AllReduceDepsPass::ApplyImpl(
REGISTER_PASS(all_reduce_deps_pass,
paddle::framework::details::AllReduceDepsPass)
.RequirePassAttr(paddle::framework::details::kAllOpDescs);
.RequireGraphAttr(paddle::framework::details::kStaleProgramOpDescs);
......@@ -53,7 +53,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
#endif
void AllReduceOpHandle::RunImpl() {
platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second);
platform::RecordEvent record_event(Name());
WaitInputVarGenerated();
auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
......
......@@ -22,7 +22,7 @@ namespace framework {
namespace details {
void BroadcastOpHandle::RunImpl() {
platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
platform::RecordEvent record_event(Name());
if (places_.size() == 1) return;
......@@ -30,7 +30,7 @@ void BroadcastOpHandle::RunImpl() {
VarHandle *in_var_handle;
{
auto in_var_handles = DynamicCast<VarHandle>(inputs_);
PADDLE_ENFORCE_EQ(in_var_handles.size(), 1,
PADDLE_ENFORCE_EQ(in_var_handles.size(), 1UL,
"The number of input should be one.");
in_var_handle = in_var_handles[0];
}
......
......@@ -17,7 +17,7 @@ limitations under the License. */
#include <glog/logging.h>
#include <memory>
#include "paddle/fluid/framework/details/memory_reuse_types.h"
#include "paddle/fluid/framework/details/memory_optimize_helper.h"
#include "paddle/fluid/framework/details/multi_devices_graph_pass.h"
#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
#include "paddle/fluid/framework/details/reduce_op_handle.h"
......@@ -34,9 +34,11 @@ namespace details {
static inline bool SeqOnlyAllReduceOps(const BuildStrategy &strategy) {
// Should fix the allreduce op order if scheduling
// them in multiple threads or processes to avoid hang.
// NOTE: ParallelGraph would execute this pass on each graph, so
// don't need to append it here.
return (!strategy.enable_sequential_execution_ &&
strategy.num_trainers_ > 1) ||
strategy.enable_parallel_graph_;
strategy.num_trainers_ > 1) &&
!strategy.enable_parallel_graph_;
}
class ParallelExecutorPassBuilder : public ir::PassBuilder {
......@@ -47,6 +49,22 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
AppendPass("sequential_execution_pass");
}
// Add op fusion.
if (strategy.fuse_relu_depthwise_conv_) {
AppendPass("fuse_relu_depthwise_conv_pass");
}
// NOTE(dzhwinter): A note for automatical inplace.
// 1. modify program desc passes should put
// before inplace pass.
// 2. manually configured inplace should put
// before inplace_pass
// Add automatically inplace.
if (strategy_.enable_inplace_) {
AppendPass("inplace_pass");
}
// Add a graph viz pass to record a graph.
if (!strategy_.debug_graphviz_path_.empty()) {
auto viz_pass = AppendPass("graph_viz_pass");
......@@ -55,10 +73,6 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path));
}
// Add op fusion.
if (strategy.fuse_relu_depthwise_conv_) {
AppendPass("fuse_relu_depthwise_conv_pass");
}
if (strategy.fuse_elewise_add_act_ops_) {
auto fuse_elewise_add_act_pass = AppendPass("fuse_elewise_add_act_pass");
// Add a graph viz pass to record a graph.
......@@ -88,7 +102,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
// A side-effect of that, memory optimize cannot forsee the fetched vars
// , so fetchlist should be set persistable before call the Run interface.
if (strategy.memory_optimize_) {
auto analysis_var_pass = AppendPass("analysis_var_pass");
auto memory_optimize_pass = AppendPass("memory_optimize_pass");
}
AppendMultiDevPass(strategy);
......@@ -121,12 +135,15 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
void AppendMultiDevPass(const BuildStrategy &strategy) {
ir::Pass *multi_devices_pass;
if (strategy_.is_distribution_) {
VLOG(3) << "multi device parameter server mode";
multi_devices_pass = AppendPass("dist_multi_devices_pass").get();
} else {
if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
VLOG(3) << "multi devices collective mode with allreduce";
multi_devices_pass =
AppendPass("allreduce_mode_multi_devices_pass").get();
} else if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kReduce) {
VLOG(3) << "multi deivces collective mode with reduce";
multi_devices_pass = AppendPass("reduce_mode_multi_devices_pass").get();
} else {
PADDLE_THROW("Unknown reduce strategy.");
......@@ -157,7 +174,8 @@ bool BuildStrategy::IsMultiDevPass(const std::string &pass_name) const {
}
std::unique_ptr<ir::Graph> BuildStrategy::Apply(
const ProgramDesc &main_program, const std::vector<platform::Place> &places,
std::unique_ptr<ir::Graph> graph,
const std::vector<platform::Place> &places,
const std::string &loss_var_name, const std::vector<Scope *> &local_scopes,
const size_t &nranks,
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
......@@ -168,7 +186,6 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
// Create a default one if not finalized by user.
CreatePassesFromStrategy(false);
std::unique_ptr<ir::Graph> graph(new ir::Graph(main_program));
for (std::shared_ptr<ir::Pass> &pass : pass_builder_->AllPasses()) {
if (IsMultiDevPass(pass->Type())) {
pass->Erase(kPlaces);
......@@ -186,34 +203,12 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
pass->Erase("nccl_ctxs");
pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
#endif
} else if (pass->Type() == "analysis_var_pass") {
const std::vector<OpDesc *> *all_op_descs =
new std::vector<OpDesc *>(main_program.Block(0).AllOps());
graph->Set<const std::vector<OpDesc *>>(kAllOpDescs,
all_op_descs); // take ownership
graph->Set<GraphNodePool>(kGraphNodePool,
new GraphNodePool); // take ownership
pass->Erase(kAllOpDescs);
pass->SetNotOwned<const std::vector<OpDesc *>>(kAllOpDescs, all_op_descs);
} else if (pass->Type() == "sequential_execution_pass") {
LOG(INFO) << "set enable_sequential_execution:"
<< enable_sequential_execution_;
pass->Erase(kAllOpDescs);
pass->Set<const std::vector<OpDesc *>>(
kAllOpDescs,
new std::vector<OpDesc *>(main_program.Block(0).AllOps()));
} else if (pass->Type() == "all_reduce_deps_pass") {
LOG(INFO) << "SeqOnlyAllReduceOps:" << SeqOnlyAllReduceOps(*this)
<< ", num_trainers:" << num_trainers_;
pass->Erase(kAllOpDescs);
pass->Set<const std::vector<OpDesc *>>(
kAllOpDescs,
new std::vector<OpDesc *>(main_program.Block(0).AllOps()));
} else if (pass->Type() == "fuse_relu_depthwise_conv_pass") {
if (!use_cuda) {
LOG(WARNING) << "fuse_relu_depthwise_conv_pass is only supported on "
......@@ -221,7 +216,9 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
continue;
}
}
VLOG(3) << "Start Apply Pass " << pass->Type();
graph = pass->Apply(std::move(graph));
VLOG(3) << "Finish Apply Pass " << pass->Type();
}
return graph;
}
......@@ -239,9 +236,10 @@ USE_PASS(allreduce_mode_multi_devices_pass);
USE_PASS(dist_multi_devices_pass);
USE_PASS(multi_devices_check_pass);
USE_PASS(multi_devices_print_pass);
USE_PASS(analysis_var_pass);
USE_PASS(memory_optimize_pass);
USE_PASS(sequential_execution_pass);
USE_PASS(all_reduce_deps_pass);
USE_PASS(modify_op_lock_and_record_event_pass);
USE_PASS(inplace_pass);
USE_PASS(lock_free_optimize_pass);
USE_PASS(graph_to_program_pass);
......@@ -14,6 +14,7 @@
#pragma once
#include <memory>
#include <string>
#include <vector>
......@@ -76,9 +77,11 @@ struct BuildStrategy {
bool fuse_relu_depthwise_conv_{false};
bool memory_optimize_{false};
bool memory_early_delete_{false};
bool memory_optimize_{true};
// TODO(dzhwinter):
// make enable_inplace, memory_optimize_
// memory_early_delete_ true by default
bool enable_inplace_{true};
bool enable_sequential_execution_{false};
......@@ -112,7 +115,7 @@ struct BuildStrategy {
// Apply the passes built by the pass_builder_. The passes will be
// applied to the Program and output an ir::Graph.
std::unique_ptr<ir::Graph> Apply(const ProgramDesc &main_program,
std::unique_ptr<ir::Graph> Apply(std::unique_ptr<ir::Graph> graph,
const std::vector<platform::Place> &places,
const std::string &loss_var_name,
const std::vector<Scope *> &local_scopes,
......
......@@ -26,7 +26,7 @@
namespace paddle {
namespace framework {
namespace details {
struct ComputationOpHandle : public OpHandleBase {
class ComputationOpHandle : public OpHandleBase {
public:
ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place,
size_t scope_idx);
......
......@@ -86,7 +86,7 @@ std::vector<std::array<int, 3>> DataBalanceOpHandle::GetBalancePlan(
}
void DataBalanceOpHandle::RunImpl() {
PADDLE_ENFORCE_GT(places_.size(), 1,
PADDLE_ENFORCE_GT(places_.size(), 1UL,
"Data balance can only be enabled when the number of "
"places to run larger than 1.");
auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
......
......@@ -12,7 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/details/fetch_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
......@@ -24,12 +26,11 @@ namespace details {
FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor(
const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
std::unique_ptr<ir::Graph> &&graph)
const std::vector<platform::Place> &places, ir::Graph *graph)
: strategy_(strategy),
local_scopes_(local_scopes),
places_(places),
graph_(std::move(graph)),
graph_(graph),
pool_(strategy.num_threads_),
prepare_pool_(1), // add one more thread for generate op_deps
fetch_ctxs_(places) {
......@@ -56,7 +57,7 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
std::vector<FetchOpHandle *> fetch_ops;
for (auto &fetch_var_name : fetch_tensors) {
for (auto &var_map : graph_->Get<details::GraphVars>("vars")) {
for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) {
auto it = var_map.find(fetch_var_name);
if (it != var_map.end()) {
fetched_vars[fetch_var_name].push_back(*it->second.rbegin());
......@@ -110,14 +111,14 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
}
}
if (exception_.IsCaught()) {
ClearFetchOp(graph_.get(), &fetch_ops);
ClearFetchOp(graph_, &fetch_ops);
exception_.ReThrow();
}
}
num_complete += num_comp;
}
// Wait FetchOps.
ClearFetchOp(graph_.get(), &fetch_ops);
ClearFetchOp(graph_, &fetch_ops);
return fetches;
}
......
......@@ -32,7 +32,7 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor {
FastThreadedSSAGraphExecutor(const ExecutionStrategy &strategy,
const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
std::unique_ptr<ir::Graph> &&graph);
ir::Graph *graph);
FeedFetchList Run(const std::vector<std::string> &fetch_tensors) override;
const ir::Graph &Graph() const override;
......@@ -40,7 +40,7 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor {
ExecutionStrategy strategy_;
std::vector<Scope *> local_scopes_;
std::vector<platform::Place> places_;
std::unique_ptr<ir::Graph> graph_;
ir::Graph *graph_;
std::unordered_map<OpHandleBase *, int> op_deps_;
std::vector<OpHandleBase *> bootstrap_ops_;
......
......@@ -23,7 +23,7 @@ void FuseVarsOpHandle::RunImpl() {
auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
PADDLE_ENFORCE_EQ(in_var_handles.size(), 0);
PADDLE_ENFORCE_EQ(in_var_handles.size(), 0UL);
PADDLE_ENFORCE_EQ(out_var_handles.size() - 1, inputs_numel_.size(), "");
auto scope = local_scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
......
......@@ -22,7 +22,7 @@ namespace framework {
namespace details {
void FusedBroadcastOpHandle::RunImpl() {
platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
platform::RecordEvent record_event(Name());
if (places_.size() == 1UL) return;
......
......@@ -34,8 +34,8 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
->Var(details::kLocalExecScopeName)
->GetMutable<Scope*>() = &local_scope;
for (size_t j = 0; j < input_scope_idxes.size(); ++j) {
local_scope.Var("out_var" + j);
if (i == j) local_scope.Var("in_var" + j);
local_scope.Var("out_var" + std::to_string(j));
if (i == j) local_scope.Var("in_var" + std::to_string(j));
}
param_scopes_.emplace_back(&local_scope);
}
......@@ -62,20 +62,21 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
// add input var handle
nodes_.emplace_back(
ir::CreateNodeForTest("in_node" + i, ir::Node::Type::kVariable));
VarHandle* in_var_handle =
new VarHandle(nodes_.back().get(), 1, input_scope_idxes[i],
"in_var" + i, place_list_[input_scope_idxes[i]]);
nodes_.emplace_back(ir::CreateNodeForTest("in_node" + std::to_string(i),
ir::Node::Type::kVariable));
VarHandle* in_var_handle = new VarHandle(
nodes_.back().get(), 1, input_scope_idxes[i],
"in_var" + std::to_string(i), place_list_[input_scope_idxes[i]]);
vars_.emplace_back(in_var_handle);
op_handle_->AddInput(in_var_handle);
// add output var handle
for (size_t j = 0; j < place_list_.size(); ++j) {
nodes_.emplace_back(
ir::CreateNodeForTest("out_node" + i, ir::Node::Type::kVariable));
VarHandle* out_var_handle = new VarHandle(
nodes_.back().get(), 2, j, "out_var" + i, place_list_[j]);
nodes_.emplace_back(ir::CreateNodeForTest(
"out_node" + std::to_string(i), ir::Node::Type::kVariable));
VarHandle* out_var_handle =
new VarHandle(nodes_.back().get(), 2, j,
"out_var" + std::to_string(i), place_list_[j]);
vars_.emplace_back(out_var_handle);
op_handle_->AddOutput(out_var_handle);
}
......@@ -86,7 +87,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
std::vector<std::vector<float>> send_vec;
f::LoD lod{{0, 10, 20}};
for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
const std::string varname("in_var" + i);
const std::string varname("in_var" + std::to_string(i));
float val_scalar = static_cast<float>(i);
send_vec.push_back(
InitLoDTensor(varname, input_scope_idxes[i], lod, val_scalar));
......@@ -96,7 +97,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
WaitAll();
for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
const std::string& varname("out_var" + i);
const std::string& varname("out_var" + std::to_string(i));
for (size_t j = 0; j < place_list_.size(); ++j) {
LoDTensorEqual(varname, send_vec[i], lod, param_scopes_[j]);
}
......@@ -109,7 +110,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
2, 4, 6, 3, 1, 1, 1, 1, 3, 7};
int height = static_cast<int>(kDims[0] * 2);
for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
const std::string varname("in_var" + i);
const std::string varname("in_var" + std::to_string(i));
float val_scalar = static_cast<float>(i);
send_vector.push_back(InitSelectedRows(varname, input_scope_idxes[i],
rows, height, val_scalar));
......@@ -119,7 +120,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
WaitAll();
for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
const std::string& varname("out_var" + i);
const std::string& varname("out_var" + std::to_string(i));
for (size_t j = 0; j < place_list_.size(); ++j) {
SelectedRowsEqual(varname, input_scope_idxes[i], send_vector[i], rows,
height);
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <iostream>
#include <iterator>
#include <string>
#include "glog/logging.h"
#include "gtest/gtest.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/program_desc.h"
namespace paddle {
namespace framework {
class DummyOp : public OperatorBase {
public:
DummyOp(const std::string& type, const VariableNameMap& inputs,
const VariableNameMap& outputs, const AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
private:
void RunImpl(const Scope& scope,
const platform::Place& place) const override {}
};
class SumOpMaker : public OpProtoAndCheckerMaker {
public:
void Make() {
AddInput("X", "").AsDuplicable();
AddOutput("Out", "");
AddComment("");
}
};
class AssignOpMaker : public OpProtoAndCheckerMaker {
public:
void Make() {
AddInput("X", "").AsDuplicable();
AddOutput("Out", "");
AddComment("");
}
};
class SplitOpMaker : public OpProtoAndCheckerMaker {
public:
void Make() {
AddInput("X", "");
AddOutput("Out", "").AsDuplicable();
AddComment("");
}
};
class DummyVarTypeInference : public VarTypeInference {
public:
void operator()(const OpDesc& op_desc, BlockDesc* block) const override {
auto& inputs = op_desc.Input("X");
auto type = block->Var(inputs.front())->GetType();
auto out_var_name = op_desc.Output("Out").front();
block->Var(out_var_name)->SetType(type);
}
};
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/inplace_op_pass.h"
#include <algorithm>
#include <deque>
#include <iterator>
#include <stack>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/details/memory_optimize_pass.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/op_info.h"
// NOTE(dzhwinter): inplace means one op output variable reuse the input space.
// By our design, one operator only can read its input(const Variable),
// write its output(non-const Variable). If one operator is inplaced, means
// user have chance to write the space before reading happens.
// Especially when some optimize code writing style is applied.
//
//
// /* wrong case in operator */
// /*In this case, a larger allocation is allocated, input content is lost*/
// const Tensor* in = ctx.Input<Tensor>("In")
// Tensor* out = ctx.Output<Tensor>("Out");
// auto* out_ptr = out->mutable_data<T>(ctx.GetPlace());
// out_ptr[0] = 0; // input contect is overwrited.
// NOTE(dzhwinter):
// Only for backward compacity and stable. if enable_inplace_whitelist is turn
// on.
// only the ops in whitelist will be use inplace strategy.
// if not, all the op will be inplaced if it registered with InplaceClass
DEFINE_bool(
enable_inplace_whitelist, false,
"If this option turns on, only these op in whitelist can be inplaced."
"If it turns off, all of the running op can be candidate of inplaced op."
"Such as scale, elementwise_add"
"By default, it's turned off");
DECLARE_string(memory_optimize_debug);
// clang-format off
const std::string kInplacedOpWhiteList[] = { // NOLINT
"sigmoid",
"exp",
"relu",
"tanh",
"sqrt",
"ceil",
"floor",
"reciprocal",
"relu6",
"soft_relu",
"hard_sigmoid",
"batch_norm",
"batch_norm_grad",
"sum",
"sum_grad",
"scale",
"reshape",
"elementwise_add",
"elementwise_add_grad",
};
// clang-format on
namespace paddle {
namespace framework {
namespace details {
static inline ir::Node* GetNextCascadeInplacedVar(ir::Node* var) {
// if next op is inplaced, then return the output var
// otherwise return nullptr
PADDLE_ENFORCE(var && var->IsVar() && !var->IsCtrlVar());
ir::Node* inplaced_var = nullptr;
for (auto* next_op : var->outputs) {
for (auto* output : next_op->outputs) {
if (output->IsVar() && !output->IsCtrlVar() &&
output->Name() == var->Name()) {
inplaced_var = output;
}
}
}
return inplaced_var;
}
static inline ir::Node* GetPrevCascadeInplacedVar(ir::Node* var) {
PADDLE_ENFORCE(var && var->IsVar() && !var->IsCtrlVar());
if (var->inputs.empty()) return nullptr;
auto* prev_op = var->inputs.at(0);
auto input_it = std::find_if(prev_op->inputs.begin(), prev_op->inputs.end(),
[&](ir::Node* node) {
if (node->IsVar() && !node->IsCtrlVar() &&
node->Name() == var->Name()) {
return true;
} else {
return false;
}
});
return input_it == prev_op->inputs.end() ? nullptr : *input_it;
}
InplacePass::InplacePass() : Pass() {
if (FLAGS_enable_inplace_whitelist) {
for (auto& s : kInplacedOpWhiteList) {
whitelist_.emplace(s);
}
}
}
void InplacePass::InitSSAGraphNodes() const {
std::unordered_map<std::string, std::unordered_set<ir::Node*>> all_vars;
for (auto* op : view_.AllOps()) {
for (auto* node : op->inputs) {
if (!node->IsVar() || node->IsCtrlVar()) continue;
if (all_vars[node->Name()].count(node) == 0) {
all_vars[node->Name()].emplace(node);
var_nodes_[node->Name()].emplace_back(node);
}
}
for (auto* node : op->outputs) {
if (!node->IsVar() || node->IsCtrlVar()) continue;
if (all_vars[node->Name()].count(node) == 0) {
all_vars[node->Name()].emplace(node);
var_nodes_[node->Name()].emplace_back(node);
}
}
}
}
std::unique_ptr<ir::Graph> InplacePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
var_nodes_.clear();
view_.Build(graph.get());
InitSSAGraphNodes();
for (auto* op : view_.AllOps()) {
if (FLAGS_enable_inplace_whitelist && !whitelist_.count(op->Name()))
continue;
TryInplaceOpInputOutput(op, graph.get());
}
graph->ResolveHazard(var_nodes_);
return graph;
}
void InplacePass::InplaceModifyDesc(const std::string& var,
const std::string& cache_var,
const size_t& idx) const {
for (size_t i = idx; i < view_.AllOps().size(); ++i) {
ir::Node* op = view_.AllOps()[i];
PADDLE_ENFORCE(op->IsOp() && op->Op());
auto* op_desc = op->Op();
op_desc->RenameInput(var, cache_var);
op_desc->RenameOutput(var, cache_var);
if (op_desc->Block()->HasVar(var)) op_desc->Block()->RemoveVar(var);
op_desc->Flush();
}
}
const NodeSwapQueue InplacePass::TryInplaceModifyVar(
const std::string& var, const std::string& cache_var, const size_t& idx,
ir::Graph* graph) const {
PADDLE_ENFORCE(var_nodes_[var].size() >= 1 &&
var_nodes_[var].at(0)->Var() != nullptr);
std::unique_ptr<VarDesc> var_desc(new VarDesc(*var_nodes_[var].at(0)->Var()));
var_desc->SetName(cache_var);
NodeSwapQueue swap_nodes;
for (size_t i = idx; i < view_.AllOps().size(); ++i) {
auto* op = view_.AllOps()[i];
// redirect the input to the latest version of cache_var
for (auto* node : op->inputs) {
if (node->Name() == var) {
ir::Node* cache_node = graph->CreateVarNode(var_desc.get());
// swap node to cache_node
cache_node->outputs.insert(cache_node->outputs.end(),
node->outputs.begin(), node->outputs.end());
PADDLE_ENFORCE(node->inputs.size() == 1 && node->inputs[0]->IsOp());
auto* prev_op = node->inputs[0];
std::replace(prev_op->outputs.begin(), prev_op->outputs.end(), node,
cache_node);
cache_node->inputs.emplace_back(prev_op);
for (auto* next_op : node->outputs) {
std::replace(next_op->inputs.begin(), next_op->inputs.end(), node,
cache_node);
}
swap_nodes.emplace_back(std::make_pair(node, cache_node));
}
}
// if we need to rename the output,
// always create a newer version of cache_var
for (auto* node : op->outputs) {
if (node->Name() == var) {
ir::Node* cache_node = graph->CreateVarNode(var_desc.get());
// swap node to cache node
cache_node->outputs.insert(cache_node->outputs.end(),
node->outputs.begin(), node->outputs.end());
cache_node->inputs.emplace_back(op);
std::replace(op->outputs.begin(), op->outputs.end(), node, cache_node);
for (auto* next_op : node->outputs) {
std::replace(next_op->inputs.begin(), next_op->inputs.end(), node,
cache_node);
}
swap_nodes.emplace_back(std::make_pair(node, cache_node));
}
}
}
return swap_nodes;
}
void InplacePass::CommitModify(const NodeSwapQueue& swap_nodes,
ir::Graph* graph) const {
for (auto& pair : swap_nodes) {
auto *node = pair.first, *cache_node = pair.second;
const std::string var = node->Name(), cache_var = cache_node->Name();
var_nodes_[cache_var].emplace_back(cache_node);
graph->RemoveNode(node);
auto& nodes = var_nodes_.at(var);
// release unused var in graph. Because python side memory optimize
// may reused the var in same name, so we only clear the var node
// after current inplaced index.
nodes.erase(std::remove(nodes.begin(), nodes.end(), node), nodes.end());
}
}
void InplacePass::WithdrawModify(const NodeSwapQueue& nodes,
ir::Graph* graph) const {
for (auto& pair : nodes) {
auto *node = pair.first, *cache_node = pair.second;
const std::string var = node->Name(), cache_var = cache_node->Name();
auto* prev_op = node->inputs[0];
std::replace(prev_op->outputs.begin(), prev_op->outputs.end(), cache_node,
node);
for (auto* next_op : node->outputs) {
std::replace(next_op->inputs.begin(), next_op->inputs.end(), cache_node,
node);
}
graph->RemoveNode(cache_node);
}
}
void InplacePass::TryInplaceOpInputOutput(ir::Node* op,
ir::Graph* graph) const {
VLOG(4) << "Try to inplace op " << op->Name();
PADDLE_ENFORCE(op->Op() != nullptr && op->Op()->Block() != nullptr,
"op_desc is nullptr");
// some pre-requirments need to meet if the op want to inplaced.
auto* op_desc = op->Op();
auto& infer_inplace =
OpInfoMap::Instance().Get(op_desc->Type()).infer_inplace_;
// 1. infer_inplace_ is registered.
if (!static_cast<bool>(infer_inplace)) return;
PADDLE_ENFORCE(static_cast<bool>(infer_inplace),
"%s's infer_inplace has not been registered", op_desc->Type());
auto* block = op_desc->Block();
auto in_to_outs = infer_inplace(*op_desc, block);
auto& all_ops = view_.AllOps();
auto cursor = std::find(all_ops.begin(), all_ops.end(), op);
size_t idx = std::distance(all_ops.begin(), cursor);
for (auto& pair : in_to_outs) {
auto& in_var_name = pair.first;
auto& out_var_name = pair.second;
auto* in_node = view_.GetNodeByName(in_var_name, op->inputs);
auto* out_node = view_.GetNodeByName(out_var_name, op->outputs);
// 2. there is no external pending op on the input node
if (view_.PendingOpsOnVar(in_node).size() > 1) {
VLOG(4) << string::Sprintf(
"Skiped pair %s => %s. %s input has external dependency."
"inplace such pair will overwrite the memory.",
out_var_name, in_var_name, op->Name());
continue;
}
// 3. if output has been memory optimize by python(fluid.memory_optmize()).
// this candidate can not be inplaced. Will be deprecated in the future.
if (view_.InSkipSet(out_node->Name())) {
VLOG(4) << string::Sprintf(
"Skiped %s => %s reused previous memory block in python memory "
"optmize,"
"it inplace may generate a circle",
out_var_name, in_var_name, op->Name());
continue;
}
// Debug Interface. Which would be skipped by the pass.
if (out_node->Name() == FLAGS_memory_optimize_debug) {
VLOG(3) << "Skiped var by force. FLAGS_memory_optimize_debug="
<< out_node->Name();
continue;
}
// NOTE(dzhwinter):
// two stage commit of inplaced process. if after inplace happens generate a
// circle,
// then withdraw the changes. Otherwise, safely add the node.
auto swap_nodes =
TryInplaceModifyVar(out_var_name, in_var_name, idx, graph);
if (!ir::HasCircle(*graph)) {
VLOG(3) << string::Sprintf("!!! %s, %s => %s inplaced", op->Name(),
out_var_name, in_var_name);
InplaceModifyDesc(out_var_name, in_var_name, idx);
CommitModify(swap_nodes, graph);
} else {
VLOG(3) << string::Sprintf(
"Skiped pair %s => %s, inplace will generate a circle. withdraw %s",
out_var_name, in_var_name, op->Name());
WithdrawModify(swap_nodes, graph);
}
}
}
ir::Node* GraphView::GetNodeByName(const std::string& name,
const std::vector<ir::Node*>& nodes) const {
// nodes should be op->inputs/outputs
// node in same node do have different name.
std::unordered_set<std::string> nodes_in_op;
bool has_dup_node =
std::all_of(nodes.begin(), nodes.end(), [&nodes_in_op](ir::Node* node) {
if (!node->IsVar() || node->IsCtrlVar() || node->Var() == nullptr) {
if (nodes_in_op.count(node->Name())) return true;
nodes_in_op.emplace(node->Name());
}
return false;
});
PADDLE_ENFORCE(has_dup_node == false, "nodes has same name!");
ir::Node* node = nullptr;
for (auto* it : nodes) {
if (!it->IsVar() || it->IsCtrlVar() || it->Var() == nullptr) continue;
if (it->Name() == name) {
node = it;
break;
}
}
PADDLE_ENFORCE(node != nullptr,
string::Sprintf("Not found var %s in nodes!", name));
return node;
}
std::vector<ir::Node*> GraphView::PendingOpsOnVar(ir::Node* node) {
// get the pending ops depends on same var node.
// because node also maybe a inplaced variable, so need to backtrack all the
// previous inplaced vars.
std::vector<ir::Node*> pending_ops;
ir::Node* p = node;
while (p != nullptr) {
pending_ops.insert(pending_ops.end(), p->outputs.begin(), p->outputs.end());
p = GetPrevCascadeInplacedVar(p);
}
return pending_ops;
}
void GraphView::Build(ir::Graph* g) {
// track the var nodes in correct order.
// Because we insert some new created node. Which may have data race between
// nodes.
// resolve data harzards depends on the var nodes in right order.
ops_ = SortOpLikeDescOrder(*g);
// 1. track the nodes which reused previous node in Python memory optimize.
// these node can not be inplaced, otherwise may generate a circle in graph.
std::unordered_set<std::string> all_vars;
for (auto& node : g->Nodes()) {
if (node->IsVar()) continue;
for (auto& out : node->outputs) {
if (out->IsCtrlVar() || out->Var() == nullptr) continue;
if (all_vars.count(out->Name())) {
dup_nodes_.emplace(out->Name());
} else {
all_vars.emplace(out->Name());
}
}
}
// 2. track the nodes which used by parameter server.
// these node can not be inplaced, otherwise trainer
// pserver can not find each other name.
auto update_skip_set = [&](ir::Node* node) {
for (auto& in : node->inputs) {
if (in->IsVar() && in->Var() != nullptr) dup_nodes_.emplace(in->Name());
}
for (auto& out : node->outputs) {
if (out->IsVar() && out->Var() != nullptr)
dup_nodes_.emplace(out->Name());
}
};
for (auto& node : g->Nodes()) {
if (!node->IsOp()) continue;
if (node->Name() == "send") update_skip_set(node);
if (node->Name() == "recv") update_skip_set(node);
if (node->Name() == "prefetch") update_skip_set(node);
}
}
const std::vector<ir::Node*>& GraphView::AllOps() { return ops_; }
bool GraphView::InSkipSet(const std::string& var) const {
return dup_nodes_.count(var);
}
} // namespace details
} // namespace framework
} // namespace paddle
REGISTER_PASS(inplace_pass, paddle::framework::details::InplacePass);
此差异已折叠。
此差异已折叠。
此差异已折叠。
......@@ -36,13 +36,14 @@ namespace details {
// map from variable name to variables. The variables, who have the same name,
// will have a differsent version. The offset in the
// `std::vector<VarHandle*>` is the version of varaibles.
typedef std::vector<std::unordered_map<std::string, std::vector<VarHandle*>>>
typedef std::vector<std::unordered_map<std::string, std::vector<VarHandle *>>>
GraphVars;
const char kGraphVars[] = "vars";
// aux variables to represent dependency. Useful to resolve data hazard.
typedef std::unordered_set<VarHandleBase*> GraphDepVars;
typedef std::unordered_set<VarHandleBase *> GraphDepVars;
const char kGraphDepVars[] = "dep_vars";
} // namespace details
} // namespace framework
} // namespace paddle
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册