Commit 73fa5ef3 authored by xiexionghang

fork from paddlev1.4, branch:paddle_feed_news_201910

Parent 2455cb5f

Too many changes to show: to preserve performance, only 1000 of 1000+ changed files are displayed.
@@ -27,18 +27,27 @@ message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
 message(STATUS "AR tools: ${CMAKE_AR}")
 if(WIN32)
+    option(MSVC_STATIC_CRT "use static C Runtime library by default" ON)
     set(CMAKE_SUPPRESS_REGENERATION ON)
     set(CMAKE_STATIC_LIBRARY_PREFIX lib)
     add_definitions("/DGOOGLE_GLOG_DLL_DECL=")
-    set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd")
-    set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
-    set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
-    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
+    if (MSVC_STATIC_CRT)
+        message(STATUS "Use static C runtime time, refer to https://docs.microsoft.com/en-us/cpp/c-runtime-library/crt-library-features?view=vs-2019")
+        set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd")
+        set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
+        set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
+        set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
+    endif()
     add_compile_options(/wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838)
     set(PADDLE_LINK_FLAGS "/IGNORE:4006 /IGNORE:4098 /IGNORE:4217 /IGNORE:4221")
     set(CMAKE_STATIC_LINKER_FLAGS "${CMAKE_STATIC_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
     set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
     set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
+else(WIN32)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations")
 endif(WIN32)
 find_package(CUDA QUIET)
@@ -54,7 +63,6 @@ option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FO
 option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON)
 option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF)
 option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND})
-option(WITH_CUSTOM_TRAINER "Turn on trainer implement by custom" OFF)
 option(WITH_SYSTEM_BLAS "Use system blas library" OFF)
 option(WITH_DISTRIBUTE "Compile with distributed support" OFF)
 option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF)
@@ -66,14 +74,15 @@ option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler and gperftools"
 option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF)
 option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF)
 option(WITH_PSLIB "Compile with pslib support" OFF)
+option(WITH_BOX_PS "Compile with box_ps support" OFF)
 option(WITH_CONTRIB "Compile the third-party contributation" OFF)
 option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF)
 option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE})
 option(WITH_INFERENCE_API_TEST "Test fluid inference C++ high-level api interface" OFF)
 option(WITH_HIGH_LEVEL_API_TEST "Test fluid python high-level api interface" OFF)
 option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION})
-option(WITH_FAST_MATH "Make use of fast math library, might affect the precision to some extent" ON)
 option(WITH_DGC "Use DGC(Deep Gradient Compression) or not" ON)
+option(SANITIZER_TYPE "Choose the type of sanitizer, options are: Address, Leak, Memory, Thread, Undefined" OFF)
 # PY_VERSION
 if(NOT PY_VERSION)
@@ -83,7 +92,7 @@ set(PYBIND11_PYTHON_VERSION ${PY_VERSION})
 # CMAKE_BUILD_TYPE
 if(NOT CMAKE_BUILD_TYPE)
-    set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING
+    set(CMAKE_BUILD_TYPE "Release" CACHE STRING
     "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel"
     FORCE)
 endif()
@@ -122,6 +131,12 @@ endif()
 if (REPLACE_ENFORCE_GLOG)
     add_definitions("-DREPLACE_ENFORCE_GLOG")
 endif()
+if (SANITIZER_TYPE AND NOT "${SANITIZER_TYPE}" MATCHES "^(Address|Leak|Memory|Thread|Undefined)$")
+    message("Choose the correct type of sanitizer")
+    return()
+endif()
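For orientation while reading this large diff: the `SANITIZER_TYPE` value validated above is lower-cased and mapped to a `-fsanitize=` flag in cmake/flags.cmake further down, and that flag is appended to the common compile flags. A condensed, hypothetical sketch of the whole flow (not the literal Paddle code) would be:

```cmake
# Hypothetical condensed sketch: how SANITIZER_TYPE turns into a compiler flag.
# Only one sanitizer can be active per build; "Address" is used as the example value.
set(SANITIZER_TYPE "Address" CACHE STRING "Address, Leak, Memory, Thread or Undefined")
if(SANITIZER_TYPE AND NOT "${SANITIZER_TYPE}" MATCHES "^(Address|Leak|Memory|Thread|Undefined)$")
    message(FATAL_ERROR "Choose the correct type of sanitizer")
endif()
string(TOLOWER "${SANITIZER_TYPE}" sanitizer_type)
set(fsanitize "-fsanitize=${sanitizer_type}")             # e.g. -fsanitize=address
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${fsanitize}")    # appended alongside the common flags
```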
########################################################################################
 include(external/mklml)        # download mklml package
@@ -144,15 +159,11 @@ include(external/cub)
 include(external/rocprim)
 include(external/xxhash)       # download xxhash
 include(external/dlpack)
-include(external/snappy)       # download snappy
-include(external/snappystream) # download snappystream
 include(external/warpctc)      # download, build, install warpctc
-include(external/yaml-cpp)     # download yaml
 if (NOT WIN32)
     # there is no official support of nccl, cupti in windows
     include(cupti)
-    include(external/gzstream)
 endif (NOT WIN32)
 if(WITH_PSLIB)
@@ -160,6 +171,9 @@ if(WITH_PSLIB)
     include(external/pslib_brpc)
     include(external/pslib)
 endif(WITH_PSLIB)
+if(WITH_BOX_PS)
+    include(external/box_ps)
+endif(WITH_BOX_PS)
 if(WITH_DISTRIBUTE)
     if(WITH_GRPC)
@@ -211,7 +225,6 @@ if (WITH_PROFILER)
 endif()
 include(generic)               # simplify cmake module
-include(package)               # set paddle packages
 include(ccache)                # set ccache for compilation
 include(util)                  # set unittest and link libs
 include(version)               # set PADDLE_VERSION
......
@@ -54,8 +54,8 @@ RUN wget -q https://www.python.org/ftp/python/3.7.0/Python-3.7.0.tgz && \
 RUN rm -r /root/python_build
 RUN apt-get update && \
-    apt-get install -y --allow-downgrades patchelf \
-    python3 python3-dev python3-pip \
+    apt-get install -y --allow-downgrades --allow-change-held-packages \
+    patchelf python3 python3-dev python3-pip \
     git python-pip python-dev python-opencv openssh-server bison \
     libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 \
     wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
@@ -172,6 +172,11 @@ RUN pip3.6 --no-cache-dir install pylint pytest astroid isort
 RUN pip3.7 --no-cache-dir install pylint pytest astroid isort
 RUN pip --no-cache-dir install pylint pytest astroid isort LinkChecker
+RUN pip3 --no-cache-dir install coverage
+RUN pip3.6 --no-cache-dir install coverage
+RUN pip3.7 --no-cache-dir install coverage
+RUN pip --no-cache-dir install coverage
 COPY ./python/requirements.txt /root/
 RUN pip3 --no-cache-dir install -r /root/requirements.txt
 RUN pip3.6 --no-cache-dir install -r /root/requirements.txt
......
-# PaddlePaddle (clone from /baidu/paddlepaddle/paddle@feed-trainer)
+# PaddlePaddle
+Fork From http://icode.baidu.com/repos/baidu/paddlepaddle/paddle/tree/paddle_feed_news_201910 (commitid:f50e701) v1.4
 English | [简体中文](./README_cn.md)
 [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
-[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/documentation/docs/en/1.4/beginners_guide/index_en.html)
-[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/documentation/docs/zh/1.4/beginners_guide/index_cn.html)
+[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org.cn/documentation/docs/en/1.5/beginners_guide/index_en.html)
+[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/beginners_guide/index_cn.html)
 [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
 [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
@@ -18,17 +18,18 @@ learning to many products at Baidu.
 Our vision is to enable deep learning for everyone via PaddlePaddle.
 Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle.
-### Latest PaddlePaddle Release: [Fluid 1.5.1](https://github.com/PaddlePaddle/Paddle/tree/release/1.5)
+### Latest PaddlePaddle Release: [Fluid 1.5.2](https://github.com/PaddlePaddle/Paddle/tree/release/1.5)
 ### Install Latest Stable Release:
 ```
 # Linux CPU
 pip install paddlepaddle
-# Linux GPU cuda9cudnn7
-pip install paddlepaddle-gpu
 # Linux GPU cuda10cudnn7
-pip install paddlepaddle-gpu==1.5.1.post107
+pip install paddlepaddle-gpu
 # Linux GPU cuda8cudnn7
-pip install paddlepaddle-gpu==1.5.1.post87
+pip install paddlepaddle-gpu==1.5.2.post87
+# Linux GPU cuda9cudnn7
+pip install paddlepaddle-gpu==1.5.2.post97
 # For installation on other platform, refer to http://paddlepaddle.org/
 ```
@@ -76,33 +77,33 @@ Now our developers could acquire Tesla V100 online computing resources for free.
 ## Installation
-It is recommended to read [this doc](http://www.paddlepaddle.org/documentation/docs/en/1.4/beginners_guide/index_en.html) on our website.
+It is recommended to read [this doc](http://www.paddlepaddle.org.cn/documentation/docs/en/1.5/beginners_guide/index_en.html) on our website.
 ## Documentation
-We provide [English](http://www.paddlepaddle.org/documentation/docs/en/1.4/beginners_guide/index_en.html) and
-[Chinese](http://www.paddlepaddle.org/documentation/docs/zh/1.4/beginners_guide/install/index_cn.html) documentation.
+We provide [English](http://www.paddlepaddle.org.cn/documentation/docs/en/1.5/beginners_guide/index_en.html) and
+[Chinese](http://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/beginners_guide/install/index_cn.html) documentation.
 - [Deep Learning 101](https://github.com/PaddlePaddle/book)
   You might want to start from this online interactive book that can run in a Jupyter Notebook.
-- [Distributed Training](http://paddlepaddle.org/documentation/docs/en/1.4/user_guides/howto/training/multi_node_en.html)
+- [Distributed Training](http://paddlepaddle.org.cn/documentation/docs/en/1.5/user_guides/howto/training/multi_node_en.html)
   You can run distributed training jobs on MPI clusters.
-- [Python API](http://paddlepaddle.org/documentation/docs/en/1.4/api/index_en.html)
+- [Python API](http://paddlepaddle.org.cn/documentation/docs/en/1.5/api/index_en.html)
   Our new API enables much shorter programs.
-- [How to Contribute](http://paddlepaddle.org/documentation/docs/en/1.4/advanced_usage/development/contribute_to_paddle/index_en.html)
+- [How to Contribute](http://paddlepaddle.org.cn/documentation/docs/en/1.5/advanced_usage/development/contribute_to_paddle/index_en.html)
   We appreciate your contributions!
 ## Communication
 - [Github Issues](https://github.com/PaddlePaddle/Paddle/issues): bug reports, feature requests, install issues, usage issues, etc.
-- QQ discussion group: 432676488 (PaddlePaddle).
+- QQ discussion group: 796771754 (PaddlePaddle).
 - [Forums](http://ai.baidu.com/forum/topic/list/168?pageNo=1): discuss implementations, research, etc.
 ## Copyright and License
......
@@ -3,8 +3,8 @@
 [English](./README.md) | 简体中文
 [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
-[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/documentation/docs/en/1.4/beginners_guide/index_en.html)
-[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/documentation/docs/zh/1.4/beginners_guide/index_cn.html)
+[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org.cn/documentation/docs/en/1.5/beginners_guide/index_en.html)
+[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/beginners_guide/index_cn.html)
 [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
 [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
@@ -16,17 +16,18 @@ PaddlePaddle (PArallel Distributed Deep LEarning) 是一个简单易用、高效
 跟进PaddlePaddle最新特性请参考我们的[版本说明](https://github.com/PaddlePaddle/Paddle/releases)
-### PaddlePaddle最新版本: [Fluid 1.5.1](https://github.com/PaddlePaddle/Paddle/tree/release/1.5)
+### PaddlePaddle最新版本: [Fluid 1.5.2](https://github.com/PaddlePaddle/Paddle/tree/release/1.5)
 ### 安装最新稳定版本:
 ```
 # Linux CPU
 pip install paddlepaddle
-# Linux GPU cuda9cudnn7
-pip install paddlepaddle-gpu
 # Linux GPU cuda10cudnn7
-pip install paddlepaddle-gpu==1.5.1.post107
+pip install paddlepaddle-gpu
 # Linux GPU cuda8cudnn7
-pip install paddlepaddle-gpu==1.5.1.post87
+pip install paddlepaddle-gpu==1.5.2.post87
+# Linux GPU cuda9cudnn7
+pip install paddlepaddle-gpu==1.5.2.post97
 # 其他平台上的安装指引请参考 http://paddlepaddle.org/
 ```
@@ -58,33 +59,33 @@ PaddlePaddle用户可领取**免费Tesla V100在线算力资源**,训练模型
 ## 安装
-推荐阅读官网上的[安装说明](http://www.paddlepaddle.org/documentation/docs/zh/1.4/beginners_guide/install/index_cn.html)
+推荐阅读官网上的[安装说明](http://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/beginners_guide/install/index_cn.html)
 ## 文档
-我们提供[英文](http://www.paddlepaddle.org/documentation/docs/en/1.4/beginners_guide/index_en.html)和
-[中文](http://www.paddlepaddle.org/documentation/docs/zh/1.4/beginners_guide/install/index_cn.html) 文档
+我们提供[英文](http://www.paddlepaddle.org.cn/documentation/docs/en/1.5/beginners_guide/index_en.html)和
+[中文](http://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/beginners_guide/install/index_cn.html) 文档
 - [深度学习101](https://github.com/PaddlePaddle/book)
   或许您想从这个在线交互式书籍开始,可以在Jupyter Notebook中运行
-- [分布式训练](http://paddlepaddle.org/documentation/docs/zh/1.4/user_guides/howto/training/multi_node.html)
+- [分布式训练](http://paddlepaddle.org.cn/documentation/docs/zh/1.5/user_guides/howto/training/multi_node.html)
   可以在MPI集群上运行分布式训练任务
-- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.4/api_cn/index_cn.html)
+- [Python API](http://paddlepaddle.org.cn/documentation/docs/zh/1.5/api_cn/index_cn.html)
   新的API支持代码更少更简洁的程序
-- [贡献方式](http://paddlepaddle.org/documentation/docs/zh/1.4/advanced_usage/development/contribute_to_paddle/index_cn.html)
+- [贡献方式](http://paddlepaddle.org.cn/documentation/docs/zh/1.5/advanced_usage/development/contribute_to_paddle/index_cn.html)
   欢迎您的贡献!
 ## 交流与反馈
 - 欢迎您通过[Github Issues](https://github.com/PaddlePaddle/Paddle/issues)来提交问题、报告与建议
-- QQ群: 432676488 (PaddlePaddle)
+- QQ群: 796771754 (PaddlePaddle)
 - [论坛](http://ai.baidu.com/forum/topic/list/168): 欢迎大家在PaddlePaddle论坛分享在使用PaddlePaddle中遇到的问题和经验, 营造良好的论坛氛围
 ## 版权和许可证
......
@@ -62,6 +62,10 @@ if(WITH_PSLIB)
     add_definitions(-DPADDLE_WITH_PSLIB)
 endif()
+if(WITH_BOX_PS)
+    add_definitions(-DPADDLE_WITH_BOX_PS)
+endif()
 if(WITH_GPU)
     add_definitions(-DPADDLE_WITH_CUDA)
     add_definitions(-DEIGEN_USE_GPU)
@@ -88,14 +92,20 @@ if(WITH_GPU)
     include_directories(${CUDA_TOOLKIT_INCLUDE})
     if(TENSORRT_FOUND)
-        if(${CUDA_VERSION_MAJOR} VERSION_LESS 8)
-            message(FATAL_ERROR "TensorRT needs CUDA >= 8.0 to compile")
-        endif()
-        if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
-            message(FATAL_ERROR "TensorRT needs CUDNN >= 7.0 to compile")
-        endif()
-        if(${TENSORRT_MAJOR_VERSION} VERSION_LESS 4)
-            message(FATAL_ERROR "Paddle needs TensorRT >= 4.0 to compile")
-        endif()
+        if(WIN32)
+            if(${CUDA_VERSION_MAJOR} VERSION_LESS 9)
+                message(FATAL_ERROR "TensorRT needs CUDA >= 9.0 to compile on Windows")
+            endif()
+        else()
+            if(${CUDA_VERSION_MAJOR} VERSION_LESS 8)
+                message(FATAL_ERROR "TensorRT needs CUDA >= 8.0 to compile")
+            endif()
+            if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
+                message(FATAL_ERROR "TensorRT needs CUDNN >= 7.0 to compile")
+            endif()
+            if(${TENSORRT_MAJOR_VERSION} VERSION_LESS 4)
+                message(FATAL_ERROR "Paddle needs TensorRT >= 4.0 to compile")
+            endif()
+        endif()
         include_directories(${TENSORRT_INCLUDE_DIR})
     endif()
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import shutil
import glob


def main():
    src = sys.argv[1]
    dst = sys.argv[2]
    if os.path.isdir(src):  # copy directory
        pathList = os.path.split(src)
        dst = os.path.join(dst, pathList[-1])
        if not os.path.exists(dst):
            shutil.copytree(src, dst)
            print("first copy directory: {0} --->>> {1}".format(src, dst))
        else:
            shutil.rmtree(dst)
            shutil.copytree(src, dst)
            print("overwritten copy directory: {0} --->>> {1}".format(src, dst))
    else:  # copy file, wildcard
        if not os.path.exists(dst):
            os.makedirs(dst)
        srcFiles = glob.glob(src)
        for srcFile in srcFiles:
            shutil.copy(srcFile, dst)
            print("copy file: {0} --->>> {1}".format(srcFile, dst))


if __name__ == "__main__":
    main()
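For context, the `copy()` helper rewritten in cmake/inference_lib.cmake later in this commit invokes this script on Windows (using the `PYTHON_EXECUTABLE` and `COPY_SCRIPT_DIR` variables defined there). A minimal sketch of such an invocation is below; the target name and paths are illustrative only, not taken from the commit:

```cmake
# Hypothetical use of cmake/copyfile.py from a CMake custom command on Windows.
# The script copies a directory, or expands a wildcard and copies the matched files.
add_custom_command(TARGET my_target POST_BUILD
    COMMAND ${PYTHON_EXECUTABLE} ${COPY_SCRIPT_DIR}/copyfile.py
            "${CMAKE_BINARY_DIR}/bin/*.dll" "${CMAKE_BINARY_DIR}/dist")
```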
@@ -186,10 +186,6 @@ list(APPEND CUDA_NVCC_FLAGS "-std=c++11")
     list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC")
 endif(NOT WIN32)
-if(WITH_FAST_MATH)
-    # Make use of fast math library. https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html
-    list(APPEND CUDA_NVCC_FLAGS "--use_fast_math")
-endif()
 # in cuda9, suppress cuda warning on eigen
 list(APPEND CUDA_NVCC_FLAGS "-w")
 # Set :expt-relaxed-constexpr to suppress Eigen warnings
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
IF(NOT ${WITH_BOX_PS})
return()
ENDIF(NOT ${WITH_BOX_PS})
IF(WIN32 OR APPLE)
MESSAGE(WARNING
"Windows or Mac is not supported with BOX_PS in Paddle yet."
"Force WITH_BOX_PS=OFF")
SET(WITH_BOX_PS OFF CACHE STRING "Disable BOX_PS package in Windows and MacOS" FORCE)
return()
ENDIF()
INCLUDE(ExternalProject)
SET(BOX_PS_PROJECT "extern_box_ps")
IF((NOT DEFINED BOX_PS_VER) OR (NOT DEFINED BOX_PS_URL))
MESSAGE(STATUS "use pre defined download url")
SET(BOX_PS_VER "0.1.1" CACHE STRING "" FORCE)
SET(BOX_PS_NAME "box_ps" CACHE STRING "" FORCE)
SET(BOX_PS_URL "http://box-ps.gz.bcebos.com/box_ps_stub.tar.gz" CACHE STRING "" FORCE)
ENDIF()
MESSAGE(STATUS "BOX_PS_NAME: ${BOX_PS_NAME}, BOX_PS_URL: ${BOX_PS_URL}")
SET(BOX_PS_SOURCE_DIR "${THIRD_PARTY_PATH}/box_ps")
SET(BOX_PS_DOWNLOAD_DIR "${BOX_PS_SOURCE_DIR}/src/${BOX_PS_PROJECT}")
SET(BOX_PS_DST_DIR "box_ps")
SET(BOX_PS_INSTALL_ROOT "${THIRD_PARTY_PATH}/install")
SET(BOX_PS_INSTALL_DIR ${BOX_PS_INSTALL_ROOT}/${BOX_PS_DST_DIR})
SET(BOX_PS_ROOT ${BOX_PS_INSTALL_DIR})
SET(BOX_PS_INC_DIR ${BOX_PS_ROOT}/include)
SET(BOX_PS_LIB_DIR ${BOX_PS_ROOT}/lib)
SET(BOX_PS_LIB ${BOX_PS_LIB_DIR}/libbox_ps.so)
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${BOX_PS_ROOT}/lib")
INCLUDE_DIRECTORIES(${BOX_PS_INC_DIR})
FILE(WRITE ${BOX_PS_DOWNLOAD_DIR}/CMakeLists.txt
"PROJECT(BOX_PS)\n"
"cmake_minimum_required(VERSION 3.0)\n"
"install(DIRECTORY ${BOX_PS_NAME}/include ${BOX_PS_NAME}/lib \n"
" DESTINATION ${BOX_PS_DST_DIR})\n")
ExternalProject_Add(
${BOX_PS_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${BOX_PS_SOURCE_DIR}
DOWNLOAD_DIR ${BOX_PS_DOWNLOAD_DIR}
DOWNLOAD_COMMAND wget --no-check-certificate ${BOX_PS_URL} -c -q -O ${BOX_PS_NAME}.tar.gz
&& tar zxvf ${BOX_PS_NAME}.tar.gz
DOWNLOAD_NO_PROGRESS 1
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${BOX_PS_INSTALL_ROOT}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${BOX_PS_INSTALL_ROOT}
)
ADD_LIBRARY(box_ps SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET box_ps PROPERTY IMPORTED_LOCATION ${BOX_PS_LIB})
ADD_DEPENDENCIES(box_ps ${BOX_PS_PROJECT})
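Since box_ps is imported above as a shared IMPORTED target, a consuming target presumably only needs to link against it. A minimal sketch, with a hypothetical target and source file name:

```cmake
# Hypothetical consumer of the imported box_ps target defined above.
add_executable(box_ps_demo demo.cc)
target_link_libraries(box_ps_demo box_ps)   # resolves to ${BOX_PS_LIB}, i.e. libbox_ps.so
```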
@@ -33,7 +33,7 @@ SET(BRPC_LIBRARIES "${BRPC_INSTALL_DIR}/lib/libbrpc.a" CACHE FILEPATH "brpc libr
 INCLUDE_DIRECTORIES(${BRPC_INCLUDE_DIR})
 # Reference https://stackoverflow.com/questions/45414507/pass-a-list-of-prefix-paths-to-externalproject-add-in-cmake-args
-set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib|${THIRD_PARTY_PATH}/install/glog")
+set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib|${THIRD_PARTY_PATH}/install/glog")
 # If minimal .a is need, you can set WITH_DEBUG_SYMBOLS=OFF
 ExternalProject_Add(
@@ -62,7 +62,7 @@ ExternalProject_Add(
     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
     -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
 )
-ADD_DEPENDENCIES(extern_brpc protobuf ssl crypto leveldb gflags glog gtest snappy)
+ADD_DEPENDENCIES(extern_brpc protobuf ssl crypto leveldb gflags glog gtest)
 ADD_LIBRARY(brpc STATIC IMPORTED GLOBAL)
 SET_PROPERTY(TARGET brpc PROPERTY IMPORTED_LOCATION ${BRPC_LIBRARIES})
 ADD_DEPENDENCIES(brpc extern_brpc)
......
@@ -23,14 +23,14 @@ INCLUDE_DIRECTORIES(${DGC_INCLUDE_DIR})
 ExternalProject_Add(
     extern_dgc
     ${EXTERNAL_PROJECT_LOG_ARGS}
-    GIT_REPOSITORY "https://github.com/PaddlePaddle/Fleet"
-    GIT_TAG "2d04dc3800cdd0601f1b65d547dabcc60b0cf9dc"
+    URL "http://fleet.bj.bcebos.com/collective.tgz"
+    URL_MD5 "015d565156c3de4e30fe25473f47e7a9"
     SOURCE_DIR "${DGC_SOURCES_DIR}"
     CONFIGURE_COMMAND ""
-    BUILD_COMMAND cd collective && make -j
+    BUILD_COMMAND make -j
     INSTALL_COMMAND mkdir -p ${DGC_INSTALL_DIR}/lib/ ${DGC_INCLUDE_DIR}/dgc
-        && cp ${DGC_SOURCES_DIR}/collective/build/lib/libdgc.a ${DGC_LIBRARIES}
-        && cp ${DGC_SOURCES_DIR}/collective/build/include/dgc.h ${DGC_INCLUDE_DIR}/dgc/
+        && cp ${DGC_SOURCES_DIR}/build/lib/libdgc.a ${DGC_LIBRARIES}
+        && cp ${DGC_SOURCES_DIR}/build/include/dgc.h ${DGC_INCLUDE_DIR}/dgc/
     BUILD_IN_SOURCE 1
 )
......
@@ -3,15 +3,6 @@ INCLUDE(ExternalProject)
 SET(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3)
 SET(EIGEN_INCLUDE_DIR ${EIGEN_SOURCE_DIR}/src/extern_eigen3)
 INCLUDE_DIRECTORIES(${EIGEN_INCLUDE_DIR})
-if(NOT WITH_FAST_MATH)
-    # EIGEN_FAST_MATH: https://eigen.tuxfamily.org/dox/TopicPreprocessorDirectives.html
-    # enables some optimizations which might affect the accuracy of the result.
-    # This currently enables the SSE vectorization of sin() and cos(),
-    # and speedups sqrt() for single precision.
-    # Defined to 1 by default. Define it to 0 to disable.
-    add_definitions(-DEIGEN_FAST_MATH=0)
-endif()
 if(WIN32)
     set(EIGEN_GIT_REPOSITORY https://github.com/wopeizl/eigen-git-mirror)
......
@@ -21,6 +21,8 @@ IF(WIN32)
     set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/gflags_static.lib" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE)
 ELSE(WIN32)
     set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/libgflags.a" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE)
+    set(BUILD_COMMAND $(MAKE) --silent)
+    set(INSTALL_COMMAND $(MAKE) install)
 ENDIF(WIN32)
 INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR})
@@ -31,6 +33,8 @@ ExternalProject_Add(
     GIT_REPOSITORY "https://github.com/gflags/gflags.git"
     GIT_TAG 77592648e3f3be87d6c7123eb81cbad75f9aef5a
     PREFIX ${GFLAGS_SOURCES_DIR}
+    BUILD_COMMAND ${BUILD_COMMAND}
+    INSTALL_COMMAND ${INSTALL_COMMAND}
     UPDATE_COMMAND ""
     CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
     -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
@@ -50,6 +54,7 @@ ExternalProject_Add(
     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
     -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
 )
 ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL)
 SET_PROPERTY(TARGET gflags PROPERTY IMPORTED_LOCATION ${GFLAGS_LIBRARIES})
 ADD_DEPENDENCIES(gflags extern_gflags)
......
@@ -13,6 +13,9 @@
 # limitations under the License.
 #FIXME:(gongwb) Move brpc's gtest dependency.
+include(GNUInstallDirs)
 IF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC))
     IF(WITH_TESTING)
         ENABLE_TESTING()
@@ -28,14 +31,14 @@ IF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC))
     IF(WIN32)
         set(GTEST_LIBRARIES
-            "${GTEST_INSTALL_DIR}/lib/gtest.lib" CACHE FILEPATH "gtest libraries." FORCE)
+            "${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/gtest.lib" CACHE FILEPATH "gtest libraries." FORCE)
         set(GTEST_MAIN_LIBRARIES
-            "${GTEST_INSTALL_DIR}/lib/gtest_main.lib" CACHE FILEPATH "gtest main libraries." FORCE)
+            "${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/gtest_main.lib" CACHE FILEPATH "gtest main libraries." FORCE)
     ELSE(WIN32)
         set(GTEST_LIBRARIES
-            "${GTEST_INSTALL_DIR}/lib/libgtest.a" CACHE FILEPATH "gtest libraries." FORCE)
+            "${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/libgtest.a" CACHE FILEPATH "gtest libraries." FORCE)
         set(GTEST_MAIN_LIBRARIES
-            "${GTEST_INSTALL_DIR}/lib/libgtest_main.a" CACHE FILEPATH "gtest main libraries." FORCE)
+            "${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/libgtest_main.a" CACHE FILEPATH "gtest main libraries." FORCE)
     ENDIF(WIN32)
     IF(WITH_MKLML)
@@ -48,7 +51,7 @@ IF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC))
         ${EXTERNAL_PROJECT_LOG_ARGS}
         DEPENDS ${GTEST_DEPENDS}
         GIT_REPOSITORY "https://github.com/google/googletest.git"
-        GIT_TAG "release-1.8.0"
+        GIT_TAG "release-1.8.1"
         PREFIX ${GTEST_SOURCES_DIR}
         UPDATE_COMMAND ""
         CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
......
@@ -34,8 +34,6 @@ ExternalProject_Add(
     BUILD_IN_SOURCE 1
 )
-ADD_DEPENDENCIES(extern_leveldb snappy)
 ADD_LIBRARY(leveldb STATIC IMPORTED GLOBAL)
 SET_PROPERTY(TARGET leveldb PROPERTY IMPORTED_LOCATION ${LEVELDB_LIBRARIES})
 ADD_DEPENDENCIES(leveldb extern_leveldb)
@@ -43,7 +43,7 @@ IF(WIN32)
 ELSE()
     #TODO(intel-huying):
     #  Now enable Erf function in mklml library temporarily, it will be updated as offical version later.
-    SET(MKLML_VER "Glibc225_vsErf_mklml_lnx_${TIME_VERSION}" CACHE STRING "" FORCE)
+    SET(MKLML_VER "csrmm2_mklml_lnx_2019.0.2" CACHE STRING "" FORCE)
     SET(MKLML_URL "http://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
     SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so)
     SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so)
......
@@ -37,7 +37,7 @@ INCLUDE(GNUInstallDirs)
 INCLUDE(ExternalProject)
 SET(NGRAPH_PROJECT         "extern_ngraph")
-SET(NGRAPH_GIT_TAG         "4ec94acc11084a5d53418f565529310fa584899a")
+SET(NGRAPH_GIT_TAG         "e26d602a756f5f83e6c8220f910b61d7089fa951")
 SET(NGRAPH_SOURCES_DIR     ${THIRD_PARTY_PATH}/ngraph)
 SET(NGRAPH_INSTALL_DIR     ${THIRD_PARTY_PATH}/install/ngraph)
 SET(NGRAPH_INC_DIR         ${NGRAPH_INSTALL_DIR}/include)
@@ -76,6 +76,7 @@ ExternalProject_Add(
     CMAKE_ARGS -DMKLDNN_INCLUDE_DIR=${MKLDNN_INC_DIR}
     CMAKE_ARGS -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}
     CMAKE_ARGS -DMKLML_LIB_DIR=${MKLML_INSTALL_DIR}/lib
+    CMAKE_ARGS -NGRAPH_USE_LEGACY_MKLDNN=TRUE
 )
 add_dependencies(ngraph ${NGRAPH_PROJECT})
......
@@ -58,7 +58,41 @@ IF(NOT ${CBLAS_FOUND})
         UPDATE_COMMAND      ""
         CONFIGURE_COMMAND   ""
     )
-    ELSE()
+    ELSE(NOT WIN32)
SET(CBLAS_FOUND false)
SET(CBLAS_LIBRARIES
"${CBLAS_INSTALL_DIR}/lib/openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
CACHE FILEPATH "openblas library." FORCE)
INCLUDE_DIRECTORIES(${CBLAS_INC_DIR}/openblas) # For openbals code to include its own headers.
INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install)
ExternalProject_Add(
extern_openblas
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY https://github.com/xianyi/OpenBLAS.git
GIT_TAG "v0.3.7"
PREFIX ${CBLAS_SOURCES_DIR}
INSTALL_DIR ${CBLAS_INSTALL_DIR}
BUILD_IN_SOURCE 0
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_INSTALL_PREFIX=${CBLAS_INSTALL_DIR}
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-DBUILD_SHARED_LIBS=ON
-DMSVC_STATIC_CRT=${MSVC_STATIC_CRT}
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${CBLAS_INSTALL_DIR}
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
)
add_custom_command(TARGET extern_openblas POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy ${CBLAS_INSTALL_DIR}/bin/openblas${CMAKE_SHARED_LIBRARY_SUFFIX} ${CBLAS_INSTALL_DIR}/lib )
ADD_LIBRARY(openblas STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET openblas PROPERTY IMPORTED_LOCATION ${CBLAS_LIBRARIES})
ADD_DEPENDENCIES(openblas extern_openblas)
 ENDIF(NOT WIN32)
 SET(CBLAS_PROVIDER openblas)
 ENDIF(NOT ${CBLAS_FOUND})
......
@@ -222,6 +222,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
     -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}
     -DCMAKE_INSTALL_LIBDIR=lib
     -DBUILD_SHARED_LIBS=OFF
+    -Dprotobuf_MSVC_STATIC_RUNTIME=${MSVC_STATIC_CRT}
     CMAKE_CACHE_ARGS
     -DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR}
     -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
include (ExternalProject)
# NOTE: snappy is needed when linking with recordio
set(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy)
set(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy)
set(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include" CACHE PATH "snappy include directory." FORCE)
if(WIN32)
SET(SNAPPY_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4244 /wd4267")
else()
SET(SNAPPY_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
endif()
ExternalProject_Add(
extern_snappy
GIT_REPOSITORY "https://github.com/google/snappy"
GIT_TAG "1.1.7"
PREFIX ${SNAPPY_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS=${SNAPPY_CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DBUILD_TESTING=OFF
-DSNAPPY_BUILD_TESTS:BOOL=OFF
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${SNAPPY_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPY_INSTALL_DIR}/lib
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
)
IF(WIN32)
set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/snappy.lib")
else(WIN32)
set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a")
endif (WIN32)
add_library(snappy STATIC IMPORTED GLOBAL)
set_property(TARGET snappy PROPERTY IMPORTED_LOCATION ${SNAPPY_LIBRARIES})
include_directories(${SNAPPY_INCLUDE_DIR})
add_dependencies(snappy extern_snappy)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
include (ExternalProject)
set(SNAPPYSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy_stream)
set(SNAPPYSTREAM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy_stream)
set(SNAPPYSTREAM_INCLUDE_DIR "${SNAPPYSTREAM_INSTALL_DIR}/include" CACHE PATH "snappy stream include directory." FORCE)
if(WIN32)
# Fix me, VS2015 come without VLA support
set(SNAPPYSTREAM_LIBRARIES "${SNAPPYSTREAM_INSTALL_DIR}/lib/snappystream.lib")
MESSAGE(WARNING, "In windows, snappystream has no compile support for windows,
please build it manually and put it at " ${SNAPPYSTREAM_INSTALL_DIR})
else(WIN32)
set(SNAPPYSTREAM_LIBRARIES "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a")
ExternalProject_Add(
extern_snappystream
GIT_REPOSITORY "https://github.com/hoxnox/snappystream.git"
GIT_TAG "0.2.8"
PREFIX ${SNAPPYSTREAM_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-DSNAPPY_ROOT=${SNAPPY_INSTALL_DIR}
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS
-DCMAKE_INSTALL_PREFIX:PATH=${SNAPPYSTREAM_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPYSTREAM_INSTALL_DIR}/lib
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
DEPENDS snappy
)
endif(WIN32)
add_library(snappystream STATIC IMPORTED GLOBAL)
set_property(TARGET snappystream PROPERTY IMPORTED_LOCATION ${SNAPPYSTREAM_LIBRARIES})
include_directories(${SNAPPYSTREAM_INCLUDE_DIR}) # For snappysteam to include its own headers.
include_directories(${THIRD_PARTY_PATH}/install) # For Paddle to include snappy stream headers.
add_dependencies(snappystream extern_snappystream)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
include (ExternalProject)
IF(NOT ${WITH_CUSTOM_TRAINER})
return()
ENDIF(NOT ${WITH_CUSTOM_TRAINER})
set(YAML_SOURCES_DIR ${THIRD_PARTY_PATH}/yaml-cpp)
set(YAML_INSTALL_DIR ${THIRD_PARTY_PATH}/install/yaml-cpp)
set(YAML_INCLUDE_DIR "${YAML_INSTALL_DIR}/include" CACHE PATH "yaml include directory." FORCE)
SET(YAML_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
ExternalProject_Add(
extern_yaml
GIT_REPOSITORY "https://github.com/jbeder/yaml-cpp"
GIT_TAG "yaml-cpp-0.6.2"
PREFIX ${YAML_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS=${YAML_CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-DCMAKE_INSTALL_PREFIX=${YAML_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR=${YAML_INSTALL_DIR}/lib
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DBUILD_TESTING=OFF
-DYAML_BUILD_TESTS:BOOL=OFF
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${YAML_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR:PATH=${YAML_INSTALL_DIR}/lib
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
)
set(YAML_LIBRARIES "${YAML_INSTALL_DIR}/lib/libyaml-cpp.a")
add_library(yaml-cpp STATIC IMPORTED GLOBAL)
set_property(TARGET yaml-cpp PROPERTY IMPORTED_LOCATION ${YAML_LIBRARIES})
include_directories(${YAML_INCLUDE_DIR})
add_dependencies(yaml-cpp extern_yaml)
@@ -37,6 +37,12 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
 function(safe_set_flag is_c src_list flag_name)
     string(REPLACE "-" "_" safe_name ${flag_name})
     string(REPLACE "=" "_" safe_name ${safe_name})
+    if(${flag_name} MATCHES "fsanitize")
+        set(CMAKE_REQUIRED_FLAGS_RETAINED ${CMAKE_REQUIRED_FLAGS})
+        set(CMAKE_REQUIRED_FLAGS ${flag_name})
+    endif()
     if(is_c)
         CHECK_C_COMPILER_FLAG(${flag_name} C_COMPILER_SUPPORT_FLAG_${safe_name})
         set(safe_name C_COMPILER_SUPPORT_FLAG_${safe_name})
@@ -47,6 +53,10 @@ function(safe_set_flag is_c src_list flag_name)
     if(${safe_name})
         set(${src_list} "${${src_list}} ${flag_name}" PARENT_SCOPE)
     endif()
+    if(${flag_name} MATCHES "fsanitize")
+        set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED})
+    endif()
 endfunction()
 # helper macro to set cflag
@@ -108,6 +118,20 @@ if(BARRIER_FOUND)
 endif(BARRIER_FOUND)
 SET(CMAKE_EXTRA_INCLUDE_FILES "")
# Only one sanitizer is allowed in compile time
string(TOLOWER "${SANITIZER_TYPE}" sanitizer_type)
if(sanitizer_type STREQUAL "address")
set(fsanitize "-fsanitize=address")
elseif(sanitizer_type STREQUAL "leak")
set(fsanitize "-fsanitize=leak")
elseif(sanitizer_type STREQUAL "memory")
set(fsanitize "-fsanitize=memory")
elseif(sanitizer_type STREQUAL "thread")
set(fsanitize "-fsanitize=thread")
elseif(sanitizer_type STREQUAL "undefined")
set(fsanitize "-fsanitize=undefined")
endif()
 # Common flags. the compiler flag used for C/C++ sources whenever release or debug
 # Do not care if this flag is support for gcc.
@@ -131,7 +155,7 @@ set(COMMON_FLAGS
     -Wno-error=terminate # Warning in PADDLE_ENFORCE
     -Wno-error=int-in-bool-context # Warning in Eigen gcc 7.2
     -Wimplicit-fallthrough=0 # Warning in tinyformat.h
-    -Wno-error=maybe-uninitialized # Warning in boost gcc 7.2
+    ${fsanitize}
 )
 set(GPU_COMMON_FLAGS
@@ -173,14 +197,13 @@ endif(UNIX AND NOT APPLE)
 foreach(flag ${COMMON_FLAGS})
     safe_set_cflag(CMAKE_C_FLAGS ${flag})
     safe_set_cxxflag(CMAKE_CXX_FLAGS ${flag})
 endforeach()
 foreach(flag ${GPU_COMMON_FLAGS})
     safe_set_nvflag(${flag})
 endforeach()
-if(WIN32)
+if(WIN32 AND MSVC_STATIC_CRT)
 # windows build turn off warnings.
 safe_set_static_flag()
 foreach(flag_var
@@ -191,4 +214,4 @@ safe_set_static_flag()
     string(REGEX REPLACE "(^| )/W[0-9]( |$)" " " ${flag_var} "${${flag_var}}")
     set(flag_var "${flag_var} /w")
 endforeach(flag_var)
-endif(WIN32)
+endif()
@@ -389,7 +389,6 @@ function(cc_test_run TARGET_NAME)
         WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
-    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_limit_of_tmp_allocation=4294967296) # 4G
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
     # No unit test should exceed 10 minutes.
     set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
@@ -472,7 +471,6 @@ function(nv_test TARGET_NAME)
         add_test(${TARGET_NAME} ${TARGET_NAME})
         set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
         set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
-        set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_limit_of_tmp_allocation=4294967296) # 4G
         set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
     endif()
 endfunction(nv_test)
@@ -725,7 +723,7 @@ function(py_test TARGET_NAME)
     if(WITH_COVERAGE)
         add_test(NAME ${TARGET_NAME}
             COMMAND ${CMAKE_COMMAND} -E env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true
-            FLAGS_cpu_deterministic=true FLAGS_limit_of_tmp_allocation=4294967296 # 4G
+            FLAGS_cpu_deterministic=true
             PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
             COVERAGE_FILE=${PADDLE_BINARY_DIR}/python-coverage.data
             ${PYTHON_EXECUTABLE} -m coverage run --branch -p ${py_test_SRCS} ${py_test_ARGS}
@@ -733,7 +731,7 @@ function(py_test TARGET_NAME)
     else()
         add_test(NAME ${TARGET_NAME}
             COMMAND ${CMAKE_COMMAND} -E env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true
-            FLAGS_cpu_deterministic=true FLAGS_limit_of_tmp_allocation=4294967296 # 4G
+            FLAGS_cpu_deterministic=true
             PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
             ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
             WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
......
@@ -13,12 +13,19 @@
 # limitations under the License.
 # make package for paddle fluid shared and static library
+if(WIN32)
+    if(NOT PYTHON_EXECUTABLE)
+        FIND_PACKAGE(PythonInterp REQUIRED)
+    endif()
+endif()
+set(COPY_SCRIPT_DIR ${PADDLE_SOURCE_DIR}/cmake)
 function(copy TARGET)
     set(options "")
     set(oneValueArgs "")
-    set(multiValueArgs SRCS DSTS DEPS)
+    set(multiValueArgs SRCS DSTS)
     cmake_parse_arguments(copy_lib "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    set(fluid_lib_dist_dep ${TARGET} ${fluid_lib_dist_dep} PARENT_SCOPE)
     list(LENGTH copy_lib_SRCS copy_lib_SRCS_len)
     list(LENGTH copy_lib_DSTS copy_lib_DSTS_len)
@@ -26,43 +33,16 @@ function(copy TARGET)
         message(FATAL_ERROR "${TARGET} source numbers are not equal to destination numbers")
     endif ()
     math(EXPR len "${copy_lib_SRCS_len} - 1")
-    add_custom_target(${TARGET} DEPENDS ${copy_lib_DEPS})
     foreach (index RANGE ${len})
         list(GET copy_lib_SRCS ${index} src)
         list(GET copy_lib_DSTS ${index} dst)
-        if (WIN32)
-            if(IS_DIRECTORY ${src})
-                get_filename_component(last_path ${src} NAME)
-                string(APPEND dst "/" ${last_path})
-                add_custom_command(TARGET ${TARGET} PRE_BUILD
-                    COMMAND ${CMAKE_COMMAND} -E make_directory "${dst}"
-                )
-                if(EXISTS ${src})
-                    add_custom_command(TARGET ${TARGET} PRE_BUILD
-                        COMMAND cmake -E copy_directory "${src}" "${dst}"
-                        COMMENT "copying ${src} -> ${dst}")
-                else()
-                    message(WARNING "${src} not exist!")
-                endif()
-            else()
-                # windows cmd shell will not expand wildcard automatically.
-                # below expand the files, and copy them by rules.
-                file(GLOB src_files ${src})
-                if (NOT "${src_files}" STREQUAL "")
-                    list(REMOVE_DUPLICATES src_files)
-                endif ()
-                add_custom_command(TARGET ${TARGET} PRE_BUILD
-                    COMMAND ${CMAKE_COMMAND} -E make_directory "${dst}"
-                )
-                foreach (src_file ${src_files})
-                    add_custom_command(TARGET ${TARGET} PRE_BUILD
-                        COMMAND ${CMAKE_COMMAND} -E copy "${src_file}" "${dst}"
-                        COMMENT "copying ${src_file} -> ${dst}")
-                endforeach ()
-            endif()
-        else (WIN32) # not windows
-            add_custom_command(TARGET ${TARGET} PRE_BUILD
+        if (WIN32) #windows
+            file(TO_NATIVE_PATH ${src} native_src)
+            file(TO_NATIVE_PATH ${dst} native_dst)
+            add_custom_command(TARGET ${TARGET} POST_BUILD
+                COMMAND ${PYTHON_EXECUTABLE} ${COPY_SCRIPT_DIR}/copyfile.py ${native_src} ${native_dst})
+        else (WIN32) #not windows
+            add_custom_command(TARGET ${TARGET} POST_BUILD
                 COMMAND mkdir -p "${dst}"
                 COMMAND cp -r "${src}" "${dst}"
                 COMMENT "copying ${src} -> ${dst}")
@@ -71,210 +51,189 @@ function(copy TARGET)
 endfunction()
# third party # third party
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/eigen3") set(third_party_deps eigen3 gflags glog boost xxhash zlib)
copy(eigen3_lib if(NOT PROTOBUF_FOUND OR WIN32)
SRCS ${EIGEN_INCLUDE_DIR}/Eigen/Core ${EIGEN_INCLUDE_DIR}/Eigen/src ${EIGEN_INCLUDE_DIR}/unsupported/Eigen list(APPEND third_party_deps extern_protobuf)
DSTS ${dst_dir}/Eigen ${dst_dir}/Eigen ${dst_dir}/unsupported endif ()
DEPS eigen3
)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/gflags") if (WITH_MKLML)
copy(gflags_lib list(APPEND third_party_deps mklml)
SRCS ${GFLAGS_INCLUDE_DIR} ${GFLAGS_LIBRARIES} elseif (NOT CBLAS_FOUND OR WIN32)
DSTS ${dst_dir} ${dst_dir}/lib list(APPEND third_party_deps extern_openblas)
DEPS gflags endif ()
)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/glog") if (WITH_MKLDNN)
copy(glog_lib list(APPEND third_party_deps mkldnn_shared_lib)
SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES} endif ()
DSTS ${dst_dir} ${dst_dir}/lib
DEPS glog
)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/boost/") if (WITH_NGRAPH)
copy(boost_lib list(APPEND third_party_deps ngraph)
SRCS ${BOOST_INCLUDE_DIR}/boost endif ()
DSTS ${dst_dir}
DEPS boost
)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/xxhash") add_custom_target(third_party DEPENDS ${third_party_deps})
copy(xxhash_lib
SRCS ${XXHASH_INCLUDE_DIR} ${XXHASH_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib
DEPS xxhash
)
if (NOT PROTOBUF_FOUND OR WIN32) # inference-only library
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/protobuf") set(inference_lib_deps third_party paddle_fluid paddle_fluid_shared)
copy(protobuf_lib add_custom_target(inference_lib_dist DEPENDS ${inference_lib_deps})
SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LIBRARY}
DSTS ${dst_dir} ${dst_dir}/lib
DEPS extern_protobuf
)
endif ()
if (WITH_MKLML) set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/eigen3")
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mklml") copy(inference_lib_dist
SRCS ${EIGEN_INCLUDE_DIR}/Eigen/Core ${EIGEN_INCLUDE_DIR}/Eigen/src ${EIGEN_INCLUDE_DIR}/unsupported/Eigen
DSTS ${dst_dir}/Eigen ${dst_dir}/Eigen ${dst_dir}/unsupported)
set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/boost")
copy(inference_lib_dist
SRCS ${BOOST_INCLUDE_DIR}/boost
DSTS ${dst_dir})
if(WITH_MKLML)
set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/install/mklml")
if(WIN32) if(WIN32)
copy(mklml_lib copy(inference_lib_dist
SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_SHARED_LIB} SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_SHARED_LIB}
${MKLML_SHARED_LIB_DEPS} ${MKLML_SHARED_IOMP_LIB} ${MKLML_INC_DIR} ${MKLML_SHARED_LIB_DEPS} ${MKLML_SHARED_IOMP_LIB} ${MKLML_INC_DIR}
DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir}/lib DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir}/lib
${dst_dir}/lib ${dst_dir}/lib ${dst_dir} ${dst_dir}/lib ${dst_dir}/lib ${dst_dir})
DEPS mklml
)
else() else()
copy(mklml_lib copy(inference_lib_dist
SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_INC_DIR} SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_INC_DIR}
DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir} DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir})
DEPS mklml
)
endif() endif()
elseif (NOT CBLAS_FOUND OR WIN32) elseif (NOT CBLAS_FOUND OR WIN32)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/openblas") set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/install/openblas")
copy(openblas_lib copy(inference_lib_dist
SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include
DSTS ${dst_dir} ${dst_dir} DSTS ${dst_dir} ${dst_dir})
DEPS extern_openblas
)
endif () endif ()
if (WITH_MKLDNN) if(WITH_MKLDNN)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mkldnn") set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/install/mkldnn")
if(WIN32) if(WIN32)
copy(mkldnn_lib copy(inference_lib_dist
SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB} ${MKLDNN_LIB} SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB} ${MKLDNN_LIB}
DSTS ${dst_dir} ${dst_dir}/lib ${dst_dir}/lib DSTS ${dst_dir} ${dst_dir}/lib ${dst_dir}/lib)
DEPS mkldnn_shared_lib else()
) copy(inference_lib_dist
else() SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB}
copy(mkldnn_lib DSTS ${dst_dir} ${dst_dir}/lib)
SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB} endif()
DSTS ${dst_dir} ${dst_dir}/lib endif()
DEPS mkldnn_shared_lib
) set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/install/gflags")
endif() copy(inference_lib_dist
endif () SRCS ${GFLAGS_INCLUDE_DIR} ${GFLAGS_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib)
if (WITH_NGRAPH)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/ngraph")
copy(ngraph_lib
SRCS ${NGRAPH_INC_DIR} ${NGRAPH_LIB_DIR}
DSTS ${dst_dir} ${dst_dir}
DEPS ngraph
)
endif ()
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy") set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/install/glog")
copy(snappy_lib copy(inference_lib_dist
SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES} SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib DSTS ${dst_dir} ${dst_dir}/lib)
DEPS snappy)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappystream") set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/install/xxhash")
copy(snappystream_lib copy(inference_lib_dist
SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES} SRCS ${XXHASH_INCLUDE_DIR} ${XXHASH_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib DSTS ${dst_dir} ${dst_dir}/lib)
DEPS snappystream)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib") set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/install/zlib")
copy(zlib_lib copy(inference_lib_dist
SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES} SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib DSTS ${dst_dir} ${dst_dir}/lib)
DEPS zlib)
# paddle fluid module
set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
set(dst_dir "${FLUID_INSTALL_DIR}/paddle/fluid")
set(module "framework")
if (NOT WIN32)
set(framework_lib_deps framework_py_proto)
endif (NOT WIN32)
copy(framework_lib DEPS ${framework_lib_deps}
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/data_feed.pb.h ${src_dir}/${module}/ir/memory_optimize_pass/*.h
${src_dir}/${module}/ir/*.h ${src_dir}/${module}/fleet/*.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}/ir/memory_optimize_pass ${dst_dir}/${module}/ir ${dst_dir}/${module}/fleet
)
set(module "memory") if (NOT PROTOBUF_FOUND OR WIN32)
copy(memory_lib set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/install/protobuf")
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/detail/*.h ${src_dir}/${module}/allocation/*.h copy(inference_lib_dist
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/detail ${dst_dir}/${module}/allocation SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LIBRARY}
) DSTS ${dst_dir} ${dst_dir}/lib)
endif ()
set(inference_deps paddle_fluid_shared paddle_fluid)
set(module "inference/api") if (WITH_NGRAPH)
set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/install/ngraph")
copy(inference_lib_dist
SRCS ${NGRAPH_INC_DIR} ${NGRAPH_LIB_DIR}
DSTS ${dst_dir} ${dst_dir})
endif ()
if (TENSORRT_FOUND) if (TENSORRT_FOUND)
copy(tensorrt_lib DEPS ${inference_deps} set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/install/tensorrt")
SRCS ${TENSORRT_ROOT}/include/Nv*.h ${TENSORRT_ROOT}/lib/libnvinfer* copy(inference_lib_dist
DSTS ${FLUID_INSTALL_DIR}/third_party/install/tensorrt/include ${FLUID_INSTALL_DIR}/third_party/install/tensorrt/lib) SRCS ${TENSORRT_ROOT}/include/Nv*.h ${TENSORRT_ROOT}/lib/*nvinfer*
DSTS ${dst_dir}/include ${dst_dir}/lib)
endif () endif ()
if (ANAKIN_FOUND) if (ANAKIN_FOUND)
copy(anakin_lib DEPS ${inference_deps} set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/install/anakin")
copy(inference_lib_dist
SRCS ${ANAKIN_ROOT}/* SRCS ${ANAKIN_ROOT}/*
DSTS ${FLUID_INSTALL_DIR}/third_party/install/anakin) DSTS ${dst_dir})
endif () endif ()
set(module "inference") copy(inference_lib_dist
SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
DSTS ${FLUID_INFERENCE_INSTALL_DIR})
set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
if(WIN32) if(WIN32)
set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/libpaddle_fluid.*) set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/libpaddle_fluid.*)
else(WIN32) else(WIN32)
set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*) set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*)
endif(WIN32) endif(WIN32)
copy(inference_lib DEPS ${inference_deps}
SRCS ${src_dir}/${module}/*.h ${paddle_fluid_lib} copy(inference_lib_dist
${src_dir}/${module}/api/paddle_*.h SRCS ${src_dir}/inference/api/paddle_*.h ${paddle_fluid_lib}
DSTS ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include ${FLUID_INFERENCE_INSTALL_DIR}/paddle/lib)
# fluid library for both train and inference
set(fluid_lib_deps inference_lib_dist)
add_custom_target(fluid_lib_dist ALL DEPENDS ${fluid_lib_deps})
set(dst_dir "${FLUID_INSTALL_DIR}/paddle/fluid")
set(module "inference")
copy(fluid_lib_dist
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/api/paddle_*.h ${paddle_fluid_lib}
DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
)
set(module "framework")
set(framework_lib_deps framework_proto)
add_dependencies(fluid_lib_dist ${framework_lib_deps})
copy(fluid_lib_dist
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/data_feed.pb.h ${src_dir}/${module}/ir/memory_optimize_pass/*.h
${src_dir}/${module}/ir/*.h ${src_dir}/${module}/fleet/*.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}/ir/memory_optimize_pass ${dst_dir}/${module}/ir ${dst_dir}/${module}/fleet)
set(module "memory")
copy(fluid_lib_dist
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/detail/*.h ${src_dir}/${module}/allocation/*.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/detail ${dst_dir}/${module}/allocation
) )
set(module "platform") set(module "platform")
copy(platform_lib DEPS profiler_py_proto set(platform_lib_deps profiler_proto)
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h add_dependencies(fluid_lib_dist ${platform_lib_deps})
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details copy(fluid_lib_dist
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/platform/profiler.pb.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details ${dst_dir}/${module}
) )
set(module "string") set(module "string")
copy(string_lib copy(fluid_lib_dist
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/tinyformat/*.h SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/tinyformat/*.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/tinyformat DSTS ${dst_dir}/${module} ${dst_dir}/${module}/tinyformat
) )
set(module "pybind") set(module "pybind")
copy(pybind_lib copy(fluid_lib_dist
SRCS ${CMAKE_CURRENT_BINARY_DIR}/paddle/fluid/${module}/pybind.h SRCS ${CMAKE_CURRENT_BINARY_DIR}/paddle/fluid/${module}/pybind.h
DSTS ${dst_dir}/${module} DSTS ${dst_dir}/${module}
) )
# CMakeCache Info # CMakeCache Info
copy(cmake_cache copy(fluid_lib_dist
SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt SRCS ${FLUID_INFERENCE_INSTALL_DIR}/third_party ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
DSTS ${FLUID_INSTALL_DIR}) DSTS ${FLUID_INSTALL_DIR} ${FLUID_INSTALL_DIR}
# This command generates a complete fluid library for both train and inference
add_custom_target(fluid_lib_dist DEPENDS ${fluid_lib_dist_dep})
# Following commands generate a inference-only fluid library
# third_party, version.txt and CMakeCache.txt are the same position with ${FLUID_INSTALL_DIR}
copy(third_party DEPS fluid_lib_dist
SRCS ${FLUID_INSTALL_DIR}/third_party ${FLUID_INSTALL_DIR}/CMakeCache.txt
DSTS ${FLUID_INFERENCE_INSTALL_DIR} ${FLUID_INFERENCE_INSTALL_DIR}
) )
# only need libpaddle_fluid.so/a and paddle_*.h for inference-only library
copy(inference_api_lib DEPS fluid_lib_dist
SRCS ${paddle_fluid_lib}
${FLUID_INSTALL_DIR}/paddle/fluid/inference/paddle_*.h
DSTS ${FLUID_INFERENCE_INSTALL_DIR}/paddle/lib ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include
)
add_custom_target(inference_lib_dist DEPENDS third_party inference_api_lib)
# paddle fluid version # paddle fluid version
function(version version_file) function(version version_file)
execute_process( execute_process(
......
...@@ -110,7 +110,7 @@ function(op_library TARGET) ...@@ -110,7 +110,7 @@ function(op_library TARGET)
# Define operators that don't need pybind here. # Define operators that don't need pybind here.
foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op" foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op"
"tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op" "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op" "sync_batch_norm_op" "deformable_conv_op" "dgc_op") "fusion_transpose_flatten_concat_op" "fusion_conv_inception_op" "sync_batch_norm_op" "dgc_op" "fused_fc_elementwise_layernorm_op")
if ("${TARGET}" STREQUAL "${manual_pybind_op}") if ("${TARGET}" STREQUAL "${manual_pybind_op}")
set(pybind_flag 1) set(pybind_flag 1)
endif() endif()
...@@ -191,9 +191,6 @@ function(op_library TARGET) ...@@ -191,9 +191,6 @@ function(op_library TARGET)
file(APPEND ${pybind_file} "USE_OP(fake_quantize_abs_max);\n") file(APPEND ${pybind_file} "USE_OP(fake_quantize_abs_max);\n")
elseif(${TARGET} STREQUAL "tensorrt_engine_op") elseif(${TARGET} STREQUAL "tensorrt_engine_op")
message(STATUS "Pybind skips [tensorrt_engine_op], for this OP is only used in inference") message(STATUS "Pybind skips [tensorrt_engine_op], for this OP is only used in inference")
elseif(${TARGET} STREQUAL "fc")
# HACK: fc only have mkldnn and cpu, which would mismatch the cpu only condition
file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
else() else()
file(APPEND ${pybind_file} "USE_OP(${TARGET});\n") file(APPEND ${pybind_file} "USE_OP(${TARGET});\n")
endif() endif()
......
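Note on the hunk above: `op_library()` only decides which registration lines get appended to `${pybind_file}`; with the `fc` special case removed, the default `USE_OP(${TARGET})` path now covers it. A rough sketch of what the generated registration stub might contain is shown below. The include path and the second operator name are illustrative assumptions, not part of this commit.

```cpp
// Hypothetical excerpt of the generated pybind registration stub.
// The header location is an assumption; the appended lines mirror the
// file(APPEND ${pybind_file} ...) calls made by op_library() above.
#include "paddle/fluid/framework/op_registry.h"  // assumed location of USE_OP

USE_OP(fake_quantize_abs_max);   // appended explicitly by op_library()
USE_OP(elementwise_add);         // hypothetical op registered the default way
// tensorrt_engine_op is skipped on purpose: it is inference-only.
```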
set(CPACK_PACKAGE_NAME paddle)
set(CPACK_PACKAGE_VERSION_MAJOR ${PADDLE_MAJOR_VERSION})
set(CPACK_PACKAGE_VERSION_MINOR ${PADDLE_MINOR_VERSION})
set(CPACK_PACKAGE_VERSION_PATCH ${PADDLE_PATCH_VERSION})
set(CPACK_PACKAGE_VERSION ${PADDLE_VERSION})
## DEB Settings
set(CPACK_DEBIAN_PACKAGE_NAME paddle)
set(CPACK_DEBIAN_PACKAGE_ARCHITECTURE amd64)
set(CPACK_DEBIAN_PACKAGE_MAINTAINER PaddlePaddle Dev <paddle-dev@baidu.com>)
set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "Paddle")
set(CPACK_PACKAGE_DESCRIPTION "")
set(CPACK_DEBIAN_PACKAGE_DEPENDS "libpython2.7-dev, libstdc++6, python-pip, curl, libgfortran3, python-pip-whl")
set(CPACK_DEBIAN_PACKAGE_SECTION Devel)
set(CPACK_DEBIAN_PACKAGE_VERSION ${PADDLE_VERSION})
set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PADDLE_SOURCE_DIR}/paddle/scripts/deb/postinst")
#set(CPACK_GENERATOR "DEB")
# Start cpack
include (CMakePackageConfigHelpers)
include (CPack)
...@@ -2,14 +2,28 @@ if(NOT WITH_GPU) ...@@ -2,14 +2,28 @@ if(NOT WITH_GPU)
return() return()
endif() endif()
set(TENSORRT_ROOT "/usr" CACHE PATH "TENSORRT ROOT") if(WIN32)
if("${TENSORRT_ROOT}" STREQUAL "")
message(WARNING "Please specify the TensorRT root path: TENSORRT_ROOT.")
endif()
string(REPLACE "\\" "/" TENSORRT_ROOT "${TENSORRT_ROOT}")
set(TR_INFER_LIB nvinfer.lib)
set(TR_INFER_RT nvinfer.dll)
set(TR_INFER_PLUGIN_RT nvinfer_plugin.dll)
else()
set(TENSORRT_ROOT "/usr" CACHE PATH "TENSORRT ROOT")
set(TR_INFER_LIB libnvinfer.a)
set(TR_INFER_RT libnvinfer.so)
set(TR_INFER_PLUGIN_RT libnvinfer_plugin.so)
endif()
find_path(TENSORRT_INCLUDE_DIR NvInfer.h find_path(TENSORRT_INCLUDE_DIR NvInfer.h
PATHS ${TENSORRT_ROOT} ${TENSORRT_ROOT}/include PATHS ${TENSORRT_ROOT} ${TENSORRT_ROOT}/include
$ENV{TENSORRT_ROOT} $ENV{TENSORRT_ROOT}/include $ENV{TENSORRT_ROOT} $ENV{TENSORRT_ROOT}/include
NO_DEFAULT_PATH NO_DEFAULT_PATH
) )
find_library(TENSORRT_LIBRARY NAMES libnvinfer.so libnvinfer.a find_library(TENSORRT_LIBRARY NAMES ${TR_INFER_LIB} ${TR_INFER_RT}
PATHS ${TENSORRT_ROOT} ${TENSORRT_ROOT}/lib PATHS ${TENSORRT_ROOT} ${TENSORRT_ROOT}/lib
$ENV{TENSORRT_ROOT} $ENV{TENSORRT_ROOT}/lib $ENV{TENSORRT_ROOT} $ENV{TENSORRT_ROOT}/lib
NO_DEFAULT_PATH NO_DEFAULT_PATH
......
This diff has been collapsed.
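The `Printf` helper added above formats text straight into the archive's buffer: it first tries `snprintf` into the remaining space and, if that is not enough, grows the buffer with `PrepareWrite` and formats again. A minimal usage sketch follows; the header path, the `paddle::framework` namespace, and default-constructibility of the archive are assumptions based on this hunk, not guarantees.

```cpp
// Sketch only: header path and namespace are assumptions; Printf and
// BinaryArchiveType are taken from the diff above.
#include "paddle/fluid/framework/archive.h"  // assumed header location

int main() {
  paddle::framework::Archive<paddle::framework::BinaryArchiveType> ar;
  // Each call appends formatted bytes to the archive's internal buffer,
  // growing it when the first snprintf does not fit.
  ar.Printf("%d %s\n", 42, "hello");
  ar.Printf("%.3f\n", 3.14159);
  return 0;
}
```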
...@@ -4,7 +4,6 @@ add_subdirectory(framework) ...@@ -4,7 +4,6 @@ add_subdirectory(framework)
add_subdirectory(imperative) add_subdirectory(imperative)
add_subdirectory(operators) add_subdirectory(operators)
add_subdirectory(string) add_subdirectory(string)
add_subdirectory(recordio)
add_subdirectory(pybind) add_subdirectory(pybind)
# NOTE: please add subdirectory inference at last. # NOTE: please add subdirectory inference at last.
......
...@@ -63,7 +63,7 @@ if(WITH_GPU) ...@@ -63,7 +63,7 @@ if(WITH_GPU)
else() else()
cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor) cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor)
endif() endif()
cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version) cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version)
cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory) cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor) nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
...@@ -123,8 +123,8 @@ cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_co ...@@ -123,8 +123,8 @@ cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_co
cc_library(transfer_scope_cache SRCS transfer_scope_cache.cc DEPS scope framework_proto device_context) cc_library(transfer_scope_cache SRCS transfer_scope_cache.cc DEPS scope framework_proto device_context)
cc_library(op_kernel_type SRCS op_kernel_type.cc DEPS device_context place) cc_library(op_kernel_type SRCS op_kernel_type.cc DEPS device_context place)
cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog data_feed_proto
shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type) shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack)
cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context) cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context)
...@@ -133,7 +133,9 @@ cc_test(version_test SRCS version_test.cc DEPS version) ...@@ -133,7 +133,9 @@ cc_test(version_test SRCS version_test.cc DEPS version)
cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version) cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version)
cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc memory_optimize_helper) cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
cc_library(op_call_stack SRCS op_call_stack.cc DEPS op_proto_maker enforce)
nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
...@@ -193,18 +195,17 @@ else() ...@@ -193,18 +195,17 @@ else()
cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op) cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
endif() endif()
target_link_libraries(executor while_op_helper executor_gc_helper recurrent_op_helper) target_link_libraries(executor while_op_helper executor_gc_helper recurrent_op_helper conditional_block_op_helper)
cc_library(parallel_executor SRCS parallel_executor.cc DEPS cc_library(parallel_executor SRCS parallel_executor.cc DEPS
threaded_ssa_graph_executor scope_buffered_ssa_graph_executor parallel_ssa_graph_executor async_ssa_graph_executor threaded_ssa_graph_executor scope_buffered_ssa_graph_executor parallel_ssa_graph_executor async_ssa_graph_executor
graph build_strategy graph build_strategy
fast_threaded_ssa_graph_executor variable_helper) fast_threaded_ssa_graph_executor variable_helper)
cc_library(prune SRCS prune.cc DEPS framework_proto) cc_library(prune SRCS prune.cc DEPS framework_proto boost)
cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
proto_desc) proto_desc)
cc_test(inplace_op_inference_test SRCS inplace_op_inference_test.cc DEPS inplace_op_pass op_registry proto_desc op_info memory_optimize_helper pass_builder)
cc_library(selected_rows SRCS selected_rows.cc DEPS tensor) cc_library(selected_rows SRCS selected_rows.cc DEPS tensor)
cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows) cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)
...@@ -222,6 +223,9 @@ endif (NOT WIN32) ...@@ -222,6 +223,9 @@ endif (NOT WIN32)
cc_library(dlpack_tensor SRCS dlpack_tensor.cc DEPS tensor dlpack) cc_library(dlpack_tensor SRCS dlpack_tensor.cc DEPS tensor dlpack)
cc_test(dlpack_tensor_test SRCS dlpack_tensor_test.cc DEPS dlpack_tensor glog) cc_test(dlpack_tensor_test SRCS dlpack_tensor_test.cc DEPS dlpack_tensor glog)
cc_library(op_compatible_info SRCS op_compatible_info DEPS string_helper)
cc_test(op_compatible_info_test SRCS op_compatible_info_test.cc DEPS op_compatible_info string_helper glog)
# Get the current working branch # Get the current working branch
execute_process( execute_process(
COMMAND git rev-parse --abbrev-ref HEAD COMMAND git rev-parse --abbrev-ref HEAD
......
...@@ -168,10 +168,10 @@ class ArchiveBase { ...@@ -168,10 +168,10 @@ class ArchiveBase {
#else #else
if (newsize > Capacity()) { if (newsize > Capacity()) {
#endif #endif
Reserve(std::max(Capacity() * 2, newsize)); Reserve((std::max)(Capacity() * 2, newsize));
} }
finish_ = buffer_ + newsize; finish_ = buffer_ + newsize;
cursor_ = std::min(cursor_, finish_); cursor_ = (std::min)(cursor_, finish_);
} }
void Reserve(size_t newcap) { void Reserve(size_t newcap) {
...@@ -207,7 +207,7 @@ class ArchiveBase { ...@@ -207,7 +207,7 @@ class ArchiveBase {
#else #else
if (size > size_t(limit_ - finish_)) { if (size > size_t(limit_ - finish_)) {
#endif #endif
Reserve(std::max(Capacity() * 2, Length() + size)); Reserve((std::max)(Capacity() * 2, Length() + size));
} }
} }
...@@ -311,6 +311,18 @@ class Archive<BinaryArchiveType> : public ArchiveBase { ...@@ -311,6 +311,18 @@ class Archive<BinaryArchiveType> : public ArchiveBase {
*this >> x; *this >> x;
return x; return x;
} }
template <class... ARGS>
void Printf(const char* fmt, ARGS&&... args) {
size_t temp = Limit() - Finish();
int len = snprintf(Finish(), temp, fmt, args...);
CHECK(len >= 0); // NOLINT
if ((size_t)len >= temp) {
PrepareWrite(len + 1);
CHECK(snprintf(Finish(), (size_t)len + 1, fmt, args...) == len);
}
AdvanceFinish(len);
}
}; };
template <class AR, class T, size_t N> template <class AR, class T, size_t N>
...@@ -518,11 +530,11 @@ Archive<AR>& operator>>(Archive<AR>& ar, std::tuple<T...>& x) { ...@@ -518,11 +530,11 @@ Archive<AR>& operator>>(Archive<AR>& ar, std::tuple<T...>& x) {
} \ } \
template <class AR, class KEY, class VALUE, class... ARGS> \ template <class AR, class KEY, class VALUE, class... ARGS> \
Archive<AR>& operator>>(Archive<AR>& ar, MAP_TYPE<KEY, VALUE, ARGS...>& p) { \ Archive<AR>& operator>>(Archive<AR>& ar, MAP_TYPE<KEY, VALUE, ARGS...>& p) { \
size_t size = ar.template Get<size_t>(); \ size_t size = ar.template get<size_t>(); \
p.clear(); \ p.clear(); \
RESERVE_STATEMENT; \ RESERVE_STATEMENT; \
for (size_t i = 0; i < size; i++) { \ for (size_t i = 0; i < size; i++) { \
p.insert(ar.template Get<std::pair<KEY, VALUE>>()); \ p.insert(ar.template get<std::pair<KEY, VALUE>>()); \
} \ } \
return ar; \ return ar; \
} }
...@@ -539,11 +551,11 @@ Archive<AR>& operator>>(Archive<AR>& ar, std::tuple<T...>& x) { ...@@ -539,11 +551,11 @@ Archive<AR>& operator>>(Archive<AR>& ar, std::tuple<T...>& x) {
} \ } \
template <class AR, class KEY, class VALUE, class... ARGS> \ template <class AR, class KEY, class VALUE, class... ARGS> \
Archive<AR>& operator>>(Archive<AR>& ar, MAP_TYPE<KEY, VALUE, ARGS...>& p) { \ Archive<AR>& operator>>(Archive<AR>& ar, MAP_TYPE<KEY, VALUE, ARGS...>& p) { \
size_t size = ar.template Get<uint64_t>(); \ size_t size = ar.template get<uint64_t>(); \
p.clear(); \ p.clear(); \
RESERVE_STATEMENT; \ RESERVE_STATEMENT; \
for (size_t i = 0; i < size; i++) { \ for (size_t i = 0; i < size; i++) { \
p.insert(ar.template Get<std::pair<KEY, VALUE>>()); \ p.insert(ar.template get<std::pair<KEY, VALUE>>()); \
} \ } \
return ar; \ return ar; \
} }
...@@ -568,11 +580,11 @@ ARCHIVE_REPEAT(std::unordered_multimap, p.reserve(size)) ...@@ -568,11 +580,11 @@ ARCHIVE_REPEAT(std::unordered_multimap, p.reserve(size))
} \ } \
template <class AR, class KEY, class... ARGS> \ template <class AR, class KEY, class... ARGS> \
Archive<AR>& operator>>(Archive<AR>& ar, SET_TYPE<KEY, ARGS...>& p) { \ Archive<AR>& operator>>(Archive<AR>& ar, SET_TYPE<KEY, ARGS...>& p) { \
size_t size = ar.template Get<size_t>(); \ size_t size = ar.template get<size_t>(); \
p.clear(); \ p.clear(); \
RESERVE_STATEMENT; \ RESERVE_STATEMENT; \
for (size_t i = 0; i < size; i++) { \ for (size_t i = 0; i < size; i++) { \
p.insert(ar.template Get<KEY>()); \ p.insert(ar.template get<KEY>()); \
} \ } \
return ar; \ return ar; \
} }
...@@ -588,11 +600,11 @@ ARCHIVE_REPEAT(std::unordered_multimap, p.reserve(size)) ...@@ -588,11 +600,11 @@ ARCHIVE_REPEAT(std::unordered_multimap, p.reserve(size))
} \ } \
template <class AR, class KEY, class... ARGS> \ template <class AR, class KEY, class... ARGS> \
Archive<AR>& operator>>(Archive<AR>& ar, SET_TYPE<KEY, ARGS...>& p) { \ Archive<AR>& operator>>(Archive<AR>& ar, SET_TYPE<KEY, ARGS...>& p) { \
size_t size = ar.template Get<uint64_t>(); \ size_t size = ar.template get<uint64_t>(); \
p.clear(); \ p.clear(); \
RESERVE_STATEMENT; \ RESERVE_STATEMENT; \
for (size_t i = 0; i < size; i++) { \ for (size_t i = 0; i < size; i++) { \
p.insert(ar.template Get<KEY>()); \ p.insert(ar.template get<KEY>()); \
} \ } \
return ar; \ return ar; \
} }
......
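The macros above stamp out `operator>>` overloads for map- and set-like containers (read the element count, clear, optionally reserve, then insert each element); the matching `operator<<` halves sit in the parts of the macros elided from this hunk. A short round-trip sketch, again assuming the header path and namespace:

```cpp
// Sketch only: header path and namespace are assumptions; the operator<<
// direction for maps is implied by the macro halves not shown in this hunk.
#include <map>
#include <string>
#include "paddle/fluid/framework/archive.h"  // assumed header location

int main() {
  using paddle::framework::Archive;
  using paddle::framework::BinaryArchiveType;

  Archive<BinaryArchiveType> ar;
  std::map<std::string, int> src{{"a", 1}, {"b", 2}};
  ar << src;  // serialize the size, then each key/value pair

  std::map<std::string, int> dst;
  ar >> dst;  // clear dst, then insert the pairs read back from the archive
  return dst.size() == 2 ? 0 : 1;
}
```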
...@@ -40,7 +40,7 @@ class ChannelObject { ...@@ -40,7 +40,7 @@ class ChannelObject {
// capacity can be zero // capacity can be zero
explicit ChannelObject(size_t capacity) { explicit ChannelObject(size_t capacity) {
capacity_ = std::min(MaxCapacity(), capacity); capacity_ = (std::min)(MaxCapacity(), capacity);
} }
void Clear() { void Clear() {
...@@ -192,7 +192,7 @@ class ChannelObject { ...@@ -192,7 +192,7 @@ class ChannelObject {
std::condition_variable full_cond_; std::condition_variable full_cond_;
static constexpr size_t MaxCapacity() { static constexpr size_t MaxCapacity() {
return std::numeric_limits<size_t>::max() / 2; return (std::numeric_limits<size_t>::max)() / 2;
} }
void Notify() { void Notify() {
...@@ -289,7 +289,7 @@ template <class T> ...@@ -289,7 +289,7 @@ template <class T>
using Channel = std::shared_ptr<ChannelObject<T>>; using Channel = std::shared_ptr<ChannelObject<T>>;
template <class T> template <class T>
Channel<T> MakeChannel(size_t capacity = std::numeric_limits<size_t>::max()) { Channel<T> MakeChannel(size_t capacity = (std::numeric_limits<size_t>::max)()) {
return std::make_shared<ChannelObject<T>>(capacity); return std::make_shared<ChannelObject<T>>(capacity);
} }
...@@ -332,7 +332,7 @@ class ChannelReader { ...@@ -332,7 +332,7 @@ class ChannelReader {
} }
if (cursor_ >= buffer_.size()) { if (cursor_ >= buffer_.size()) {
cursor_ = 0; cursor_ = 0;
if (channel_->Read(buffer_) == 0) { if (channel_->read(buffer_) == 0) {
failed_ = true; failed_ = true;
return *this; return *this;
} }
...@@ -370,7 +370,7 @@ class ChannelWriter { ...@@ -370,7 +370,7 @@ class ChannelWriter {
void Reset(ChannelObject<T>* channel) { void Reset(ChannelObject<T>* channel) {
CHECK(buffer_.empty()) << "Forgot to flush"; CHECK(buffer_.empty()) << "Forgot to flush";
CHECK(channel != nullptr) << "Channel can not be nullptr"; // CHECK(channel != nullptr) << "Channel can not be nullptr";
channel_ = channel; channel_ = channel;
buffer_.clear(); buffer_.clear();
failed_ = !channel; failed_ = !channel;
......
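The channel changed here is a bounded, thread-safe queue shared through `std::shared_ptr`. A small producer/consumer sketch using only methods that appear in this commit (`MakeChannel`, `Write`, `Close`, `ReadAll`); the header location and namespace are assumptions:

```cpp
// Sketch only: header location is assumed; the channel API usage mirrors
// the calls made in data_set.cc later in this commit.
#include <thread>
#include <vector>
#include "paddle/fluid/framework/channel.h"  // assumed header location

int main() {
  auto ch = paddle::framework::MakeChannel<int>();  // default capacity: max

  std::thread producer([&] {
    std::vector<int> batch{1, 2, 3, 4};
    ch->Write(std::move(batch));  // move a whole batch into the channel
    ch->Close();                  // signal that no more data will arrive
  });

  producer.join();
  std::vector<int> out;
  ch->ReadAll(out);               // drain everything, as data_set.cc does
  return out.size() == 4 ? 0 : 1;
}
```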
#pragma once
#include <string>
namespace paddle {
namespace framework {
static std::string paddle_commit() {
return "95c1816ec0";
}
static std::string paddle_compile_branch() {
return "develop";
}
static std::string paddle_version() {
return "0.0.0";
}
} // namespace framework
} // namespace paddle
...@@ -33,11 +33,53 @@ limitations under the License. */ ...@@ -33,11 +33,53 @@ limitations under the License. */
#include "io/shell.h" #include "io/shell.h"
#include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/fleet/fleet_wrapper.h"
#include "paddle/fluid/platform/timer.h" #include "paddle/fluid/platform/timer.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
void RecordCandidateList::ReSize(size_t length) {
_mutex.lock();
_capacity = length;
CHECK(_capacity > 0); // NOLINT
_candidate_list.clear();
_candidate_list.resize(_capacity);
_full = false;
_cur_size = 0;
_total_size = 0;
_mutex.unlock();
}
void RecordCandidateList::ReInit() {
_mutex.lock();
_full = false;
_cur_size = 0;
_total_size = 0;
_mutex.unlock();
}
void RecordCandidateList::AddAndGet(const Record& record,
RecordCandidate* result) {
_mutex.lock();
size_t index = 0;
++_total_size;
auto fleet_ptr = FleetWrapper::GetInstance();
if (!_full) {
_candidate_list[_cur_size++] = record;
_full = (_cur_size == _capacity);
} else {
CHECK(_cur_size == _capacity);
index = fleet_ptr->LocalRandomEngine()() % _total_size;
if (index < _capacity) {
_candidate_list[index] = record;
}
}
index = fleet_ptr->LocalRandomEngine()() % _cur_size;
*result = _candidate_list[index];
_mutex.unlock();
}
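`AddAndGet` keeps at most `_capacity` records: while the list is not yet full every record is stored, and afterwards an incoming record replaces a random slot with probability `_capacity / _total_size`, which is the standard reservoir-sampling update; the caller then gets back a uniformly random candidate from the current list. A standalone sketch of the same update rule (illustrative only, not the Paddle class, and without the lock or the random draw on return):

```cpp
// Standalone reservoir-sampling sketch mirroring AddAndGet's update rule.
// Class and member names are illustrative.
#include <cstdint>
#include <random>
#include <vector>

template <typename T>
class Reservoir {
 public:
  explicit Reservoir(size_t capacity) : capacity_(capacity) {}

  void Add(const T& item) {
    ++total_;
    if (list_.size() < capacity_) {
      list_.push_back(item);                   // still filling up
    } else {
      size_t idx = rng_() % total_;            // uniform in [0, total_)
      if (idx < capacity_) list_[idx] = item;  // keep with prob capacity/total
    }
  }

  const std::vector<T>& Items() const { return list_; }

 private:
  size_t capacity_;
  uint64_t total_ = 0;
  std::mt19937_64 rng_{std::random_device{}()};
  std::vector<T> list_;
};

int main() {
  Reservoir<int> r(8);
  for (int i = 0; i < 1000; ++i) r.Add(i);
  return r.Items().size() == 8 ? 0 : 1;
}
```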
void DataFeed::AddFeedVar(Variable* var, const std::string& name) { void DataFeed::AddFeedVar(Variable* var, const std::string& name) {
CheckInit(); CheckInit();
for (size_t i = 0; i < use_slots_.size(); ++i) { for (size_t i = 0; i < use_slots_.size(); ++i) {
...@@ -101,11 +143,24 @@ void DataFeed::AssignFeedVar(const Scope& scope) { ...@@ -101,11 +143,24 @@ void DataFeed::AssignFeedVar(const Scope& scope) {
} }
} }
void DataFeed::CopyToFeedTensor(void* dst, const void* src, size_t size) {
if (platform::is_cpu_place(this->place_)) {
memcpy(dst, src, size);
} else {
#ifdef PADDLE_WITH_CUDA
cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice);
#else
PADDLE_THROW("Not supported GPU, Please compile WITH_GPU option");
#endif
}
}
template <typename T> template <typename T>
void PrivateQueueDataFeed<T>::SetQueueSize(int queue_size) { void PrivateQueueDataFeed<T>::SetQueueSize(int queue_size) {
PADDLE_ENFORCE(queue_size > 0, "Illegal queue size: %d.", queue_size); PADDLE_ENFORCE(queue_size > 0, "Illegal queue size: %d.", queue_size);
queue_size_ = queue_size; queue_size_ = queue_size;
queue_ = paddle::framework::MakeChannel<T>(); queue_ = paddle::framework::MakeChannel<T>();
queue_->SetCapacity(queue_size);
} }
template <typename T> template <typename T>
...@@ -169,6 +224,7 @@ InMemoryDataFeed<T>::InMemoryDataFeed() { ...@@ -169,6 +224,7 @@ InMemoryDataFeed<T>::InMemoryDataFeed() {
this->thread_id_ = 0; this->thread_id_ = 0;
this->thread_num_ = 1; this->thread_num_ = 1;
this->parse_ins_id_ = false; this->parse_ins_id_ = false;
this->parse_content_ = false;
this->input_channel_ = nullptr; this->input_channel_ = nullptr;
this->output_channel_ = nullptr; this->output_channel_ = nullptr;
this->consume_channel_ = nullptr; this->consume_channel_ = nullptr;
...@@ -252,6 +308,11 @@ void InMemoryDataFeed<T>::SetThreadNum(int thread_num) { ...@@ -252,6 +308,11 @@ void InMemoryDataFeed<T>::SetThreadNum(int thread_num) {
thread_num_ = thread_num; thread_num_ = thread_num;
} }
template <typename T>
void InMemoryDataFeed<T>::SetParseContent(bool parse_content) {
parse_content_ = parse_content;
}
template <typename T> template <typename T>
void InMemoryDataFeed<T>::SetParseInsId(bool parse_ins_id) { void InMemoryDataFeed<T>::SetParseInsId(bool parse_ins_id) {
parse_ins_id_ = parse_ins_id; parse_ins_id_ = parse_ins_id;
...@@ -301,7 +362,8 @@ void MultiSlotDataFeed::Init( ...@@ -301,7 +362,8 @@ void MultiSlotDataFeed::Init(
paddle::framework::MultiSlotDesc multi_slot_desc = paddle::framework::MultiSlotDesc multi_slot_desc =
data_feed_desc.multi_slot_desc(); data_feed_desc.multi_slot_desc();
SetBatchSize(data_feed_desc.batch_size()); SetBatchSize(data_feed_desc.batch_size());
SetQueueSize(data_feed_desc.batch_size()); // temporarily set queue size = batch size * 100
SetQueueSize(data_feed_desc.batch_size() * 100);
size_t all_slot_num = multi_slot_desc.slots_size(); size_t all_slot_num = multi_slot_desc.slots_size();
all_slots_.resize(all_slot_num); all_slots_.resize(all_slot_num);
all_slots_type_.resize(all_slot_num); all_slots_type_.resize(all_slot_num);
...@@ -610,15 +672,16 @@ void MultiSlotDataFeed::PutToFeedVec( ...@@ -610,15 +672,16 @@ void MultiSlotDataFeed::PutToFeedVec(
if (type[0] == 'f') { // float if (type[0] == 'f') { // float
const auto& feasign = ins_vec[i].GetFloatData(); const auto& feasign = ins_vec[i].GetFloatData();
float* tensor_ptr = feed_vec_[i]->mutable_data<float>( float* tensor_ptr =
{total_instance, 1}, platform::CPUPlace()); feed_vec_[i]->mutable_data<float>({total_instance, 1}, this->place_);
memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(float)); CopyToFeedTensor(tensor_ptr, &feasign[0], total_instance * sizeof(float));
} else if (type[0] == 'u') { // uint64 } else if (type[0] == 'u') { // uint64
// no uint64_t type in paddlepaddle // no uint64_t type in paddlepaddle
const auto& feasign = ins_vec[i].GetUint64Data(); const auto& feasign = ins_vec[i].GetUint64Data();
int64_t* tensor_ptr = feed_vec_[i]->mutable_data<int64_t>( int64_t* tensor_ptr = feed_vec_[i]->mutable_data<int64_t>(
{total_instance, 1}, platform::CPUPlace()); {total_instance, 1}, this->place_);
memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(int64_t)); CopyToFeedTensor(tensor_ptr, &feasign[0],
total_instance * sizeof(int64_t));
} }
LoD data_lod{offset}; LoD data_lod{offset};
...@@ -709,6 +772,18 @@ bool MultiSlotInMemoryDataFeed::ParseOneInstanceFromPipe(Record* instance) { ...@@ -709,6 +772,18 @@ bool MultiSlotInMemoryDataFeed::ParseOneInstanceFromPipe(Record* instance) {
pos += len + 1; pos += len + 1;
VLOG(3) << "ins_id " << instance->ins_id_; VLOG(3) << "ins_id " << instance->ins_id_;
} }
if (parse_content_) {
int num = strtol(&str[pos], &endptr, 10);
CHECK(num == 1); // NOLINT
pos = endptr - str + 1;
size_t len = 0;
while (str[pos + len] != ' ') {
++len;
}
instance->content_ = std::string(str + pos, len);
pos += len + 1;
VLOG(3) << "content " << instance->content_;
}
for (size_t i = 0; i < use_slots_index_.size(); ++i) { for (size_t i = 0; i < use_slots_index_.size(); ++i) {
int idx = use_slots_index_[i]; int idx = use_slots_index_[i];
int num = strtol(&str[pos], &endptr, 10); int num = strtol(&str[pos], &endptr, 10);
...@@ -833,8 +908,14 @@ void MultiSlotInMemoryDataFeed::PutToFeedVec( ...@@ -833,8 +908,14 @@ void MultiSlotInMemoryDataFeed::PutToFeedVec(
std::vector<std::vector<size_t>> offset(use_slots_.size(), std::vector<std::vector<size_t>> offset(use_slots_.size(),
std::vector<size_t>{0}); std::vector<size_t>{0});
std::vector<bool> visit(use_slots_.size(), false); std::vector<bool> visit(use_slots_.size(), false);
ins_content_vec_.clear();
ins_content_vec_.reserve(ins_vec.size());
ins_id_vec_.clear();
ins_id_vec_.reserve(ins_vec.size());
for (size_t i = 0; i < ins_vec.size(); ++i) { for (size_t i = 0; i < ins_vec.size(); ++i) {
auto& r = ins_vec[i]; auto& r = ins_vec[i];
ins_id_vec_.push_back(r.ins_id_);
ins_content_vec_.push_back(r.content_);
for (auto& item : r.float_feasigns_) { for (auto& item : r.float_feasigns_) {
batch_float_feasigns[item.slot()].push_back(item.sign().float_feasign_); batch_float_feasigns[item.slot()].push_back(item.sign().float_feasign_);
visit[item.slot()] = true; visit[item.slot()] = true;
...@@ -872,15 +953,15 @@ void MultiSlotInMemoryDataFeed::PutToFeedVec( ...@@ -872,15 +953,15 @@ void MultiSlotInMemoryDataFeed::PutToFeedVec(
const auto& type = all_slots_type_[i]; const auto& type = all_slots_type_[i];
if (type[0] == 'f') { // float if (type[0] == 'f') { // float
float* feasign = batch_float_feasigns[i].data(); float* feasign = batch_float_feasigns[i].data();
float* tensor_ptr = feed_vec_[i]->mutable_data<float>( float* tensor_ptr =
{total_instance, 1}, platform::CPUPlace()); feed_vec_[i]->mutable_data<float>({total_instance, 1}, this->place_);
memcpy(tensor_ptr, feasign, total_instance * sizeof(float)); CopyToFeedTensor(tensor_ptr, feasign, total_instance * sizeof(float));
} else if (type[0] == 'u') { // uint64 } else if (type[0] == 'u') { // uint64
// no uint64_t type in paddlepaddle // no uint64_t type in paddlepaddle
uint64_t* feasign = batch_uint64_feasigns[i].data(); uint64_t* feasign = batch_uint64_feasigns[i].data();
int64_t* tensor_ptr = feed_vec_[i]->mutable_data<int64_t>( int64_t* tensor_ptr = feed_vec_[i]->mutable_data<int64_t>(
{total_instance, 1}, platform::CPUPlace()); {total_instance, 1}, this->place_);
memcpy(tensor_ptr, feasign, total_instance * sizeof(int64_t)); CopyToFeedTensor(tensor_ptr, feasign, total_instance * sizeof(int64_t));
} }
auto& slot_offset = offset[i]; auto& slot_offset = offset[i];
LoD data_lod{slot_offset}; LoD data_lod{slot_offset};
...@@ -906,15 +987,16 @@ void PrivateInstantDataFeed<T>::PutToFeedVec() { ...@@ -906,15 +987,16 @@ void PrivateInstantDataFeed<T>::PutToFeedVec() {
if (type[0] == 'f') { // float if (type[0] == 'f') { // float
const auto& feasign = ins_vec_[i].GetFloatData(); const auto& feasign = ins_vec_[i].GetFloatData();
float* tensor_ptr = feed_vec_[i]->mutable_data<float>( float* tensor_ptr =
{total_instance, 1}, platform::CPUPlace()); feed_vec_[i]->mutable_data<float>({total_instance, 1}, this->place_);
memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(float)); CopyToFeedTensor(tensor_ptr, &feasign[0], total_instance * sizeof(float));
} else if (type[0] == 'u') { // uint64 } else if (type[0] == 'u') { // uint64
// no uint64_t type in paddlepaddle // no uint64_t type in paddlepaddle
const auto& feasign = ins_vec_[i].GetUint64Data(); const auto& feasign = ins_vec_[i].GetUint64Data();
int64_t* tensor_ptr = feed_vec_[i]->mutable_data<int64_t>( int64_t* tensor_ptr = feed_vec_[i]->mutable_data<int64_t>(
{total_instance, 1}, platform::CPUPlace()); {total_instance, 1}, this->place_);
memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(int64_t)); CopyToFeedTensor(tensor_ptr, &feasign[0],
total_instance * sizeof(int64_t));
} }
LoD data_lod{offset}; LoD data_lod{offset};
......
...@@ -26,6 +26,7 @@ limitations under the License. */ ...@@ -26,6 +26,7 @@ limitations under the License. */
#include <sstream> #include <sstream>
#include <string> #include <string>
#include <thread> // NOLINT #include <thread> // NOLINT
#include <unordered_map>
#include <utility> #include <utility>
#include <vector> #include <vector>
...@@ -104,13 +105,25 @@ class DataFeed { ...@@ -104,13 +105,25 @@ class DataFeed {
virtual void SetThreadNum(int thread_num) {} virtual void SetThreadNum(int thread_num) {}
// This function will do nothing at default // This function will do nothing at default
virtual void SetParseInsId(bool parse_ins_id) {} virtual void SetParseInsId(bool parse_ins_id) {}
virtual void SetParseContent(bool parse_content) {}
virtual void SetFileListMutex(std::mutex* mutex) { virtual void SetFileListMutex(std::mutex* mutex) {
mutex_for_pick_file_ = mutex; mutex_for_pick_file_ = mutex;
} }
virtual void SetFileListIndex(size_t* file_index) { file_idx_ = file_index; } virtual void SetFileListIndex(size_t* file_index) { file_idx_ = file_index; }
virtual const std::vector<std::string>& GetInsIdVec() const {
return ins_id_vec_;
}
virtual const std::vector<std::string>& GetInsContentVec() const {
return ins_content_vec_;
}
virtual int GetCurBatchSize() { return batch_size_; }
virtual void LoadIntoMemory() { virtual void LoadIntoMemory() {
PADDLE_THROW("This function(LoadIntoMemory) is not implemented."); PADDLE_THROW("This function(LoadIntoMemory) is not implemented.");
} }
virtual void SetPlace(const paddle::platform::Place& place) {
place_ = place;
}
virtual const paddle::platform::Place& GetPlace() const { return place_; }
protected: protected:
// The following three functions are used to check if it is executed in this // The following three functions are used to check if it is executed in this
...@@ -124,6 +137,7 @@ class DataFeed { ...@@ -124,6 +137,7 @@ class DataFeed {
// This function is used to pick one file from the global filelist (thread // This function is used to pick one file from the global filelist (thread
// safe). // safe).
virtual bool PickOneFile(std::string* filename); virtual bool PickOneFile(std::string* filename);
virtual void CopyToFeedTensor(void* dst, const void* src, size_t size);
std::vector<std::string> filelist_; std::vector<std::string> filelist_;
size_t* file_idx_; size_t* file_idx_;
...@@ -158,6 +172,9 @@ class DataFeed { ...@@ -158,6 +172,9 @@ class DataFeed {
bool finish_set_filelist_; bool finish_set_filelist_;
bool finish_start_; bool finish_start_;
std::string pipe_command_; std::string pipe_command_;
std::vector<std::string> ins_id_vec_;
std::vector<std::string> ins_content_vec_;
platform::Place place_;
}; };
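The new `SetPlace`/`GetPlace` pair, together with `SetParseInsId`/`SetParseContent` above, lets a reader place feed tensors directly on the device chosen by the trainer instead of always on the CPU. A minimal configuration sketch, assuming a concrete reader such as `MultiSlotInMemoryDataFeed` and the usual header locations (a real reader still needs a `DataFeedDesc`, a file list, and channels before it can start):

```cpp
// Configuration sketch only: header paths are assumptions, and the reader
// still needs its descriptor, file list and channels set elsewhere.
#include "paddle/fluid/framework/data_feed.h"  // assumed header location
#include "paddle/fluid/platform/place.h"       // assumed header location

void ConfigureReader(paddle::framework::MultiSlotInMemoryDataFeed* reader) {
  reader->SetParseInsId(true);    // keep the per-instance id
  reader->SetParseContent(true);  // keep the raw content field
#ifdef PADDLE_WITH_CUDA
  reader->SetPlace(paddle::platform::CUDAPlace(0));  // feed tensors on GPU 0
#else
  reader->SetPlace(paddle::platform::CPUPlace());    // fall back to CPU
#endif
}
```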
// PrivateQueueDataFeed is the base virtual class for other DataFeeds. // PrivateQueueDataFeed is the base virtual class for other DataFeeds.
...@@ -215,6 +232,7 @@ class InMemoryDataFeed : public DataFeed { ...@@ -215,6 +232,7 @@ class InMemoryDataFeed : public DataFeed {
virtual void SetThreadId(int thread_id); virtual void SetThreadId(int thread_id);
virtual void SetThreadNum(int thread_num); virtual void SetThreadNum(int thread_num);
virtual void SetParseInsId(bool parse_ins_id); virtual void SetParseInsId(bool parse_ins_id);
virtual void SetParseContent(bool parse_content);
virtual void LoadIntoMemory(); virtual void LoadIntoMemory();
protected: protected:
...@@ -225,6 +243,7 @@ class InMemoryDataFeed : public DataFeed { ...@@ -225,6 +243,7 @@ class InMemoryDataFeed : public DataFeed {
int thread_id_; int thread_id_;
int thread_num_; int thread_num_;
bool parse_ins_id_; bool parse_ins_id_;
bool parse_content_;
std::ifstream file_; std::ifstream file_;
std::shared_ptr<FILE> fp_; std::shared_ptr<FILE> fp_;
paddle::framework::ChannelObject<T>* input_channel_; paddle::framework::ChannelObject<T>* input_channel_;
...@@ -419,6 +438,42 @@ struct Record { ...@@ -419,6 +438,42 @@ struct Record {
std::vector<FeatureItem> uint64_feasigns_; std::vector<FeatureItem> uint64_feasigns_;
std::vector<FeatureItem> float_feasigns_; std::vector<FeatureItem> float_feasigns_;
std::string ins_id_; std::string ins_id_;
std::string content_;
};
struct RecordCandidate {
std::string ins_id_;
std::unordered_multimap<uint16_t, FeatureKey> feas;
RecordCandidate& operator=(const Record& rec) {
feas.clear();
ins_id_ = rec.ins_id_;
for (auto& fea : rec.uint64_feasigns_) {
feas.insert({fea.slot(), fea.sign()});
}
return *this;
}
};
class RecordCandidateList {
public:
RecordCandidateList() = default;
RecordCandidateList(const RecordCandidateList&) = delete;
RecordCandidateList& operator=(const RecordCandidateList&) = delete;
void ReSize(size_t length);
void ReInit();
void AddAndGet(const Record& record, RecordCandidate* result);
private:
size_t _capacity = 0;
std::mutex _mutex;
bool _full = false;
size_t _cur_size = 0;
size_t _total_size = 0;
std::vector<RecordCandidate> _candidate_list;
}; };
template <class AR> template <class AR>
......
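Putting the new declarations together: `RecordCandidateList` is sized once with `ReSize(capacity)` and then fed one record at a time; each `AddAndGet` call both updates the reservoir and hands back a random candidate, which the new slots-shuffle/feature-evaluation path appears to rely on for drawing substitute feasigns. A hedged usage sketch (header path assumed, `Record` construction elided):

```cpp
// Usage sketch only: the header path is an assumption and Record
// construction (slots, feasigns) is elided; ReSize must have been
// called once on rclist before this runs.
#include <vector>
#include "paddle/fluid/framework/data_feed.h"  // assumed header location

void SampleCandidates(const std::vector<paddle::framework::Record>& batch,
                      paddle::framework::RecordCandidateList* rclist,
                      std::vector<paddle::framework::RecordCandidate>* out) {
  out->clear();
  for (const auto& rec : batch) {
    paddle::framework::RecordCandidate cand;
    rclist->AddAndGet(rec, &cand);  // reservoir update plus a random draw
    out->push_back(cand);
  }
}
```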
...@@ -18,7 +18,6 @@ ...@@ -18,7 +18,6 @@
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#include "paddle/fluid/platform/mkldnn_reuse.h" #include "paddle/fluid/platform/mkldnn_reuse.h"
#endif #endif
...@@ -121,28 +120,35 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, ...@@ -121,28 +120,35 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
const Tensor& in, Tensor* out) { const Tensor& in, Tensor* out) {
auto in_layout = kernel_type_for_var.data_layout_; auto in_layout = kernel_type_for_var.data_layout_;
auto out_layout = expected_kernel_type.data_layout_; auto out_layout = expected_kernel_type.data_layout_;
auto place = expected_kernel_type.place_;
PADDLE_ENFORCE( PADDLE_ENFORCE(
in_layout == DataLayout::kMKLDNN && out_layout != DataLayout::kMKLDNN, in_layout == DataLayout::kMKLDNN && out_layout != DataLayout::kMKLDNN,
"TransDataLayoutFromMKLDNN only supports transform from MKLDNN to " "TransDataLayoutFromMKLDNN only supports transform from MKLDNN to "
"non-MKLDNN"); "non-MKLDNN");
innerTransDataLayoutFromMKLDNN(in_layout, out_layout, in, out, place);
}
void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, DataLayout out_layout,
const Tensor& in, Tensor* out,
platform::Place place) {
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
PADDLE_ENFORCE(in.format() != memory::format::format_undef && PADDLE_ENFORCE_NE(in.format(), MKLDNNMemoryFormat::format_undef,
in.format() != memory::format::any, "Input tensor should have specified memory format");
"Input tensor should have specified memory format"); PADDLE_ENFORCE_NE(in.format(), MKLDNNMemoryFormat::any,
"Input tensor should have specified memory format");
// Set default as NCHW in case not specified // Set default as NCHW in case not specified
out_layout = out_layout =
out_layout == DataLayout::kAnyLayout ? DataLayout::kNCHW : out_layout; out_layout == DataLayout::kAnyLayout ? DataLayout::kNCHW : out_layout;
auto& pool = platform::DeviceContextPool::Instance(); auto& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = dynamic_cast<platform::MKLDNNDeviceContext*>( auto* dev_ctx = dynamic_cast<platform::MKLDNNDeviceContext*>(pool.Get(place));
pool.Get(expected_kernel_type.place_));
auto& cpu_engine = dev_ctx->GetEngine(); auto& cpu_engine = dev_ctx->GetEngine();
std::vector<int> in_tz = paddle::framework::vectorize2int(in.dims()); auto in_tz = paddle::framework::vectorize<int>(in.dims());
std::vector<int> out_tz = in_tz; auto out_tz = in_tz;
memory::data_type in_type = ToMKLDNNDataType(in.type()); memory::data_type in_type = ToMKLDNNDataType(in.type());
PADDLE_ENFORCE(in_type != memory::data_type::data_undef, PADDLE_ENFORCE(in_type != memory::data_type::data_undef,
...@@ -157,15 +163,15 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, ...@@ -157,15 +163,15 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
if (in_format != out_format) { if (in_format != out_format) {
void* in_data = GetDataFromTensor(in, in_type); void* in_data = GetDataFromTensor(in, in_type);
const std::string key = platform::ReorderMKLDNNHandler::GetHash( const std::string key = platform::CreateKey(in_tz, in_format, out_format,
in_tz, in_format, out_format, std::to_string(in_type)); std::to_string(in_type));
platform::ReorderMKLDNNHandler handler(in_tz, in.type(), in_type, *dev_ctx, platform::ReorderMKLDNNHandler handler(in_tz, in.type(), in_type, *dev_ctx,
cpu_engine, key); cpu_engine, key);
auto reorder_src_memory_p = handler.AcquireSrcMemory(in_format, in_data); auto reorder_src_memory_p = handler.AcquireSrcMemory(in_format, in_data);
auto reorder_dst_memory_p = auto reorder_dst_memory_p =
handler.AcquireDstMemory(out, out_format, expected_kernel_type.place_); handler.AcquireDstMemory(out, out_format, place);
auto reorder_p = auto reorder_p =
handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p); handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p);
...@@ -177,7 +183,7 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, ...@@ -177,7 +183,7 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
} }
out->set_layout(out_layout); out->set_layout(out_layout);
// reset format since the out tensor will be feed to non-MKLDNN OPkernel // reset format since the out tensor will be feed to non-MKLDNN OPkernel
out->set_format(memory::format::format_undef); out->set_format(MKLDNNMemoryFormat::format_undef);
#endif #endif
} }
......
...@@ -21,30 +21,33 @@ ...@@ -21,30 +21,33 @@
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/framework/variable.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle { namespace paddle {
namespace framework { namespace framework {
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
using MKLDNNFormat = mkldnn::memory::format;
using MKLDNNDataType = mkldnn::memory::data_type; using MKLDNNDataType = mkldnn::memory::data_type;
inline MKLDNNFormat ToMKLDNNFormat(const DataLayout& layout) { inline MKLDNNMemoryFormat ToMKLDNNFormat(const DataLayout& layout) {
switch (layout) { switch (layout) {
case DataLayout::kNHWC: case DataLayout::kNHWC:
return MKLDNNFormat::nhwc; return MKLDNNMemoryFormat::nhwc;
case DataLayout::kNCHW: case DataLayout::kNCHW:
return MKLDNNFormat::nchw; return MKLDNNMemoryFormat::nchw;
default: default:
PADDLE_THROW("Fail to convert layout %s to MKLDNN format", PADDLE_THROW("Fail to convert layout %s to MKLDNN format",
DataLayoutToString(layout)); DataLayoutToString(layout));
} }
} }
inline DataLayout ToPaddleLayout(const MKLDNNFormat& format) { inline DataLayout ToPaddleLayout(const MKLDNNMemoryFormat& format) {
switch (format) { switch (format) {
case MKLDNNFormat::nhwc: case MKLDNNMemoryFormat::nhwc:
return DataLayout::kNHWC; return DataLayout::kNHWC;
case MKLDNNFormat::nchw: case MKLDNNMemoryFormat::nchw:
return DataLayout::kNCHW; return DataLayout::kNCHW;
default: default:
PADDLE_THROW("Fail to convert MKLDNN format to paddle layout"); PADDLE_THROW("Fail to convert MKLDNN format to paddle layout");
...@@ -69,6 +72,10 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, ...@@ -69,6 +72,10 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
const OpKernelType& expected_kernel_type, const OpKernelType& expected_kernel_type,
const Tensor& in, Tensor* out); const Tensor& in, Tensor* out);
void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, DataLayout out_layout,
const Tensor& in, Tensor* out,
platform::Place place);
std::vector<int> GetAxis(const DataLayout& from, const DataLayout& to); std::vector<int> GetAxis(const DataLayout& from, const DataLayout& to);
void TransDataLayout(const OpKernelType& kernel_type_for_var, void TransDataLayout(const OpKernelType& kernel_type_for_var,
......
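For orientation: `ToMKLDNNFormat` and `ToPaddleLayout` are simple two-way mappings between the framework's `DataLayout` enum and MKL-DNN memory formats, and `innerTransDataLayoutFromMKLDNN` is the new entry point that takes layouts and a place directly instead of an `OpKernelType`. A small sketch of the mapping, only meaningful in a build with `PADDLE_WITH_MKLDNN`; the header path is an assumption:

```cpp
// Sketch only: requires a Paddle build with PADDLE_WITH_MKLDNN;
// the header path is an assumption.
#include "paddle/fluid/framework/data_layout_transform.h"  // assumed location

#ifdef PADDLE_WITH_MKLDNN
void LayoutRoundTrip() {
  using paddle::framework::DataLayout;
  // kNCHW <-> nchw and kNHWC <-> nhwc; any other layout/format throws.
  auto fmt = paddle::framework::ToMKLDNNFormat(DataLayout::kNCHW);
  auto back = paddle::framework::ToPaddleLayout(fmt);  // back to kNCHW
  (void)back;
}
#endif
```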
...@@ -42,12 +42,16 @@ DatasetImpl<T>::DatasetImpl() { ...@@ -42,12 +42,16 @@ DatasetImpl<T>::DatasetImpl() {
channel_num_ = 1; channel_num_ = 1;
file_idx_ = 0; file_idx_ = 0;
cur_channel_ = 0; cur_channel_ = 0;
fleet_send_batch_size_ = 80000; fleet_send_batch_size_ = 1024;
fleet_send_sleep_seconds_ = 2; fleet_send_sleep_seconds_ = 0;
merge_by_insid_ = false; merge_by_insid_ = false;
erase_duplicate_feas_ = true; erase_duplicate_feas_ = true;
keep_unmerged_ins_ = true; keep_unmerged_ins_ = true;
min_merge_size_ = 2; min_merge_size_ = 2;
parse_ins_id_ = false;
parse_content_ = false;
preload_thread_num_ = 0;
global_index_ = 0;
} }
// set filelist, file_idx_ will reset to zero. // set filelist, file_idx_ will reset to zero.
...@@ -103,17 +107,36 @@ void DatasetImpl<T>::SetChannelNum(int channel_num) { ...@@ -103,17 +107,36 @@ void DatasetImpl<T>::SetChannelNum(int channel_num) {
channel_num_ = channel_num; channel_num_ = channel_num;
} }
template <typename T>
void DatasetImpl<T>::SetParseInsId(bool parse_ins_id) {
parse_ins_id_ = parse_ins_id;
}
template <typename T>
void DatasetImpl<T>::SetParseContent(bool parse_content) {
parse_content_ = parse_content;
}
template <typename T> template <typename T>
void DatasetImpl<T>::SetMergeByInsId( void DatasetImpl<T>::SetMergeByInsId(
const std::vector<std::string>& merge_slot_list, bool erase_duplicate_feas, const std::vector<std::string>& merge_slot_list, bool erase_duplicate_feas,
int min_merge_size, bool keep_unmerged_ins) { int min_merge_size, bool keep_unmerged_ins) {
merge_by_insid_ = true; merge_by_insid_ = true;
parse_ins_id_ = true;
merge_slots_list_ = merge_slot_list; merge_slots_list_ = merge_slot_list;
erase_duplicate_feas_ = erase_duplicate_feas; erase_duplicate_feas_ = erase_duplicate_feas;
min_merge_size_ = min_merge_size; min_merge_size_ = min_merge_size;
keep_unmerged_ins_ = keep_unmerged_ins; keep_unmerged_ins_ = keep_unmerged_ins;
} }
template <typename T>
void DatasetImpl<T>::SetFeaEval(bool fea_eval, int record_candidate_size) {
slots_shuffle_fea_eval_ = fea_eval;
slots_shuffle_rclist_.ReSize(record_candidate_size);
VLOG(3) << "SetFeaEval fea eval mode: " << fea_eval
<< " with record candidate size: " << record_candidate_size;
}
template <typename T> template <typename T>
std::vector<paddle::framework::DataFeed*> DatasetImpl<T>::GetReaders() { std::vector<paddle::framework::DataFeed*> DatasetImpl<T>::GetReaders() {
std::vector<paddle::framework::DataFeed*> ret; std::vector<paddle::framework::DataFeed*> ret;
...@@ -182,10 +205,21 @@ void DatasetImpl<T>::LoadIntoMemory() { ...@@ -182,10 +205,21 @@ void DatasetImpl<T>::LoadIntoMemory() {
template <typename T> template <typename T>
void DatasetImpl<T>::PreLoadIntoMemory() { void DatasetImpl<T>::PreLoadIntoMemory() {
VLOG(3) << "DatasetImpl<T>::PreLoadIntoMemory() begin"; VLOG(3) << "DatasetImpl<T>::PreLoadIntoMemory() begin";
preload_threads_.clear(); if (preload_thread_num_ != 0) {
for (int64_t i = 0; i < thread_num_; ++i) { CHECK(preload_thread_num_ == preload_readers_.size());
preload_threads_.push_back(std::thread( preload_threads_.clear();
&paddle::framework::DataFeed::LoadIntoMemory, readers_[i].get())); for (int64_t i = 0; i < preload_thread_num_; ++i) {
preload_threads_.push_back(
std::thread(&paddle::framework::DataFeed::LoadIntoMemory,
preload_readers_[i].get()));
}
} else {
CHECK(thread_num_ == readers_.size());
preload_threads_.clear();
for (int64_t i = 0; i < thread_num_; ++i) {
preload_threads_.push_back(std::thread(
&paddle::framework::DataFeed::LoadIntoMemory, readers_[i].get()));
}
} }
VLOG(3) << "DatasetImpl<T>::PreLoadIntoMemory() end"; VLOG(3) << "DatasetImpl<T>::PreLoadIntoMemory() end";
} }
...@@ -258,7 +292,7 @@ void DatasetImpl<T>::LocalShuffle() { ...@@ -258,7 +292,7 @@ void DatasetImpl<T>::LocalShuffle() {
} }
template <typename T> template <typename T>
void DatasetImpl<T>::GlobalShuffle() { void DatasetImpl<T>::GlobalShuffle(int thread_num) {
VLOG(3) << "DatasetImpl<T>::GlobalShuffle() begin"; VLOG(3) << "DatasetImpl<T>::GlobalShuffle() begin";
platform::Timer timeline; platform::Timer timeline;
timeline.Start(); timeline.Start();
...@@ -325,13 +359,21 @@ void DatasetImpl<T>::GlobalShuffle() { ...@@ -325,13 +359,21 @@ void DatasetImpl<T>::GlobalShuffle() {
ars.shrink_to_fit(); ars.shrink_to_fit();
data.clear(); data.clear();
data.shrink_to_fit(); data.shrink_to_fit();
sleep(this->fleet_send_sleep_seconds_); // currently the bottleneck is that the server cannot handle large data
// in time, so we can remove this sleep, set fleet_send_batch_size to
// 1024, and set the server thread num to 24.
if (fleet_send_sleep_seconds_ != 0) {
sleep(this->fleet_send_sleep_seconds_);
}
} }
}; };
VLOG(3) << "start global shuffle threads";
std::vector<std::thread> global_shuffle_threads; std::vector<std::thread> global_shuffle_threads;
for (int i = 0; i < thread_num_; ++i) { if (thread_num == -1) {
thread_num = thread_num_;
}
VLOG(3) << "start global shuffle threads, num = " << thread_num;
for (int i = 0; i < thread_num; ++i) {
global_shuffle_threads.push_back(std::thread(global_shuffle_func)); global_shuffle_threads.push_back(std::thread(global_shuffle_func));
} }
for (std::thread& t : global_shuffle_threads) { for (std::thread& t : global_shuffle_threads) {
...@@ -345,6 +387,101 @@ void DatasetImpl<T>::GlobalShuffle() { ...@@ -345,6 +387,101 @@ void DatasetImpl<T>::GlobalShuffle() {
<< timeline.ElapsedSec() << " seconds"; << timeline.ElapsedSec() << " seconds";
} }
template <typename T>
void DatasetImpl<T>::DynamicAdjustChannelNum(int channel_num) {
if (channel_num_ == channel_num) {
VLOG(3) << "DatasetImpl<T>::DynamicAdjustChannelNum channel_num_="
<< channel_num_ << ", channel_num_=channel_num, no need to adjust";
return;
}
VLOG(3) << "adjust channel num from " << channel_num_ << " to "
<< channel_num;
channel_num_ = channel_num;
std::vector<paddle::framework::Channel<T>>* origin_channels = nullptr;
std::vector<paddle::framework::Channel<T>>* other_channels = nullptr;
// find out which channel (output or consume) has data
int cur_channel = 0;
uint64_t output_channels_data_size = 0;
uint64_t consume_channels_data_size = 0;
CHECK(multi_output_channel_.size() == multi_consume_channel_.size());
for (int i = 0; i < multi_output_channel_.size(); ++i) {
output_channels_data_size += multi_output_channel_[i]->Size();
consume_channels_data_size += multi_consume_channel_[i]->Size();
}
if (output_channels_data_size != 0) {
CHECK(consume_channels_data_size == 0); // NOLINT
cur_channel = 0;
} else {
CHECK(output_channels_data_size == 0); // NOLINT
cur_channel = 1;
}
if (cur_channel == 0) {
origin_channels = &multi_output_channel_;
other_channels = &multi_consume_channel_;
} else {
origin_channels = &multi_consume_channel_;
other_channels = &multi_output_channel_;
}
CHECK(origin_channels != nullptr); // NOLINT
CHECK(other_channels != nullptr); // NOLINT
paddle::framework::Channel<T> total_data_channel =
paddle::framework::MakeChannel<T>();
std::vector<paddle::framework::Channel<T>> new_channels;
std::vector<paddle::framework::Channel<T>> new_other_channels;
std::vector<T> local_vec;
for (int i = 0; i < origin_channels->size(); ++i) {
local_vec.clear();
(*origin_channels)[i]->Close();
(*origin_channels)[i]->ReadAll(local_vec);
total_data_channel->Write(std::move(local_vec));
}
total_data_channel->Close();
total_data_channel->SetBlockSize(total_data_channel->Size() / channel_num +
1);
for (int i = 0; i < channel_num; ++i) {
local_vec.clear();
total_data_channel->Read(local_vec);
new_other_channels.push_back(paddle::framework::MakeChannel<T>());
new_channels.push_back(paddle::framework::MakeChannel<T>());
new_channels[i]->Write(std::move(local_vec));
}
total_data_channel->Clear();
origin_channels->clear();
other_channels->clear();
*origin_channels = new_channels;
*other_channels = new_other_channels;
new_channels.clear();
new_other_channels.clear();
std::vector<paddle::framework::Channel<T>>().swap(new_channels);
std::vector<paddle::framework::Channel<T>>().swap(new_other_channels);
local_vec.clear();
std::vector<T>().swap(local_vec);
VLOG(3) << "adjust channel num done";
}
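The rebalance above drains every existing channel into one temporary channel and then splits it evenly across the new channel count. A standalone sketch of that step, with plain std::vector standing in for paddle::framework::Channel (illustrative only, not the Paddle API):

#include <algorithm>
#include <cstddef>
#include <vector>

template <typename T>
std::vector<std::vector<T>> Rebalance(std::vector<std::vector<T>> old_channels,
                                      int new_channel_num) {
  // drain all old channels into one pool
  std::vector<T> pool;
  for (auto& ch : old_channels) {
    pool.insert(pool.end(), ch.begin(), ch.end());
    ch.clear();
  }
  // block size mirrors total_size / channel_num + 1 in the code above
  const size_t block = pool.size() / new_channel_num + 1;
  std::vector<std::vector<T>> new_channels(new_channel_num);
  size_t offset = 0;
  for (int i = 0; i < new_channel_num && offset < pool.size(); ++i) {
    size_t end = std::min(offset + block, pool.size());
    new_channels[i].assign(pool.begin() + offset, pool.begin() + end);
    offset = end;
  }
  return new_channels;
}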
template <typename T>
void DatasetImpl<T>::DynamicAdjustReadersNum(int thread_num) {
if (thread_num_ == thread_num) {
VLOG(3) << "DatasetImpl<T>::DynamicAdjustReadersNum thread_num_="
<< thread_num_ << ", thread_num_=thread_num, no need to adjust";
return;
}
VLOG(3) << "adjust readers num from " << thread_num_ << " to " << thread_num;
thread_num_ = thread_num;
std::vector<std::shared_ptr<paddle::framework::DataFeed>>().swap(readers_);
CreateReaders();
VLOG(3) << "adjust readers num done";
}
template <typename T>
void DatasetImpl<T>::SetFleetSendSleepSeconds(int seconds) {
fleet_send_sleep_seconds_ = seconds;
}
template <typename T> template <typename T>
void DatasetImpl<T>::CreateReaders() { void DatasetImpl<T>::CreateReaders() {
VLOG(3) << "Calling CreateReaders()"; VLOG(3) << "Calling CreateReaders()";
...@@ -352,8 +489,6 @@ void DatasetImpl<T>::CreateReaders() { ...@@ -352,8 +489,6 @@ void DatasetImpl<T>::CreateReaders() {
VLOG(3) << "Filelist size in Dataset: " << filelist_.size(); VLOG(3) << "Filelist size in Dataset: " << filelist_.size();
VLOG(3) << "channel num in Dataset: " << channel_num_; VLOG(3) << "channel num in Dataset: " << channel_num_;
CHECK(thread_num_ > 0) << "thread num should > 0"; CHECK(thread_num_ > 0) << "thread num should > 0";
CHECK(thread_num_ <= filelist_.size())
<< "thread num should <= filelist size";
CHECK(channel_num_ > 0) << "channel num should > 0"; CHECK(channel_num_ > 0) << "channel num should > 0";
CHECK(channel_num_ <= thread_num_) << "channel num should <= thread num"; CHECK(channel_num_ <= thread_num_) << "channel num should <= thread num";
VLOG(3) << "readers size: " << readers_.size(); VLOG(3) << "readers size: " << readers_.size();
...@@ -372,7 +507,8 @@ void DatasetImpl<T>::CreateReaders() { ...@@ -372,7 +507,8 @@ void DatasetImpl<T>::CreateReaders() {
readers_[i]->SetFileListMutex(&mutex_for_pick_file_); readers_[i]->SetFileListMutex(&mutex_for_pick_file_);
readers_[i]->SetFileListIndex(&file_idx_); readers_[i]->SetFileListIndex(&file_idx_);
readers_[i]->SetFileList(filelist_); readers_[i]->SetFileList(filelist_);
readers_[i]->SetParseInsId(merge_by_insid_); readers_[i]->SetParseInsId(parse_ins_id_);
readers_[i]->SetParseContent(parse_content_);
if (input_channel_ != nullptr) { if (input_channel_ != nullptr) {
readers_[i]->SetInputChannel(input_channel_.get()); readers_[i]->SetInputChannel(input_channel_.get());
} }
...@@ -401,6 +537,47 @@ void DatasetImpl<T>::DestroyReaders() { ...@@ -401,6 +537,47 @@ void DatasetImpl<T>::DestroyReaders() {
cur_channel_ = 1 - cur_channel_; cur_channel_ = 1 - cur_channel_;
} }
template <typename T>
void DatasetImpl<T>::SetPreLoadThreadNum(int thread_num) {
preload_thread_num_ = thread_num;
}
template <typename T>
void DatasetImpl<T>::CreatePreLoadReaders() {
VLOG(3) << "Begin CreatePreLoadReaders";
if (preload_thread_num_ == 0) {
preload_thread_num_ = thread_num_;
}
CHECK(preload_thread_num_ > 0) << "thread num should > 0";
CHECK(input_channel_ != nullptr);
preload_readers_.clear();
for (int i = 0; i < preload_thread_num_; ++i) {
preload_readers_.push_back(
DataFeedFactory::CreateDataFeed(data_feed_desc_.name()));
preload_readers_[i]->Init(data_feed_desc_);
preload_readers_[i]->SetThreadId(i);
preload_readers_[i]->SetThreadNum(preload_thread_num_);
preload_readers_[i]->SetFileListMutex(&mutex_for_pick_file_);
preload_readers_[i]->SetFileListIndex(&file_idx_);
preload_readers_[i]->SetFileList(filelist_);
preload_readers_[i]->SetParseInsId(parse_ins_id_);
preload_readers_[i]->SetInputChannel(input_channel_.get());
preload_readers_[i]->SetOutputChannel(nullptr);
preload_readers_[i]->SetConsumeChannel(nullptr);
}
VLOG(3) << "End CreatePreLoadReaders";
}
template <typename T>
void DatasetImpl<T>::DestroyPreLoadReaders() {
VLOG(3) << "Begin DestroyPreLoadReaders";
preload_readers_.clear();
std::vector<std::shared_ptr<paddle::framework::DataFeed>>().swap(
preload_readers_);
file_idx_ = 0;
VLOG(3) << "End DestroyPreLoadReaders";
}
template <typename T> template <typename T>
int64_t DatasetImpl<T>::GetMemoryDataSize() { int64_t DatasetImpl<T>::GetMemoryDataSize() {
return input_channel_->Size(); return input_channel_->Size();
...@@ -436,7 +613,16 @@ int DatasetImpl<T>::ReceiveFromClient(int msg_type, int client_id, ...@@ -436,7 +613,16 @@ int DatasetImpl<T>::ReceiveFromClient(int msg_type, int client_id,
CHECK(ar.Cursor() == ar.Finish()); CHECK(ar.Cursor() == ar.Finish());
auto fleet_ptr = FleetWrapper::GetInstance(); auto fleet_ptr = FleetWrapper::GetInstance();
int64_t index = fleet_ptr->LocalRandomEngine()() % channel_num_; // do not use random here because it doesn't perform well.
// to make sure each channel gets data equally, we just put data into the
// channels one by one.
// int64_t index = fleet_ptr->LocalRandomEngine()() % channel_num_;
int64_t index = 0;
{
std::unique_lock<std::mutex> lk(global_index_mutex_);
index = global_index_++;
}
index = index % channel_num_;
VLOG(3) << "ramdom index=" << index; VLOG(3) << "ramdom index=" << index;
multi_output_channel_[index]->Write(std::move(data)); multi_output_channel_[index]->Write(std::move(data));
...@@ -648,5 +834,167 @@ void MultiSlotDataset::MergeByInsId() { ...@@ -648,5 +834,167 @@ void MultiSlotDataset::MergeByInsId() {
VLOG(3) << "MultiSlotDataset::MergeByInsId end"; VLOG(3) << "MultiSlotDataset::MergeByInsId end";
} }
void MultiSlotDataset::GetRandomData(const std::set<uint16_t>& slots_to_replace,
std::vector<Record>* result) {
int debug_erase_cnt = 0;
int debug_push_cnt = 0;
auto multi_slot_desc = data_feed_desc_.multi_slot_desc();
slots_shuffle_rclist_.ReInit();
for (const auto& rec : slots_shuffle_original_data_) {
RecordCandidate rand_rec;
Record new_rec = rec;
slots_shuffle_rclist_.AddAndGet(rec, &rand_rec);
for (auto it = new_rec.uint64_feasigns_.begin();
it != new_rec.uint64_feasigns_.end();) {
if (slots_to_replace.find(it->slot()) != slots_to_replace.end()) {
it = new_rec.uint64_feasigns_.erase(it);
debug_erase_cnt += 1;
} else {
++it;
}
}
for (auto slot : slots_to_replace) {
auto range = rand_rec.feas.equal_range(slot);
for (auto it = range.first; it != range.second; ++it) {
new_rec.uint64_feasigns_.push_back({it->second, it->first});
debug_push_cnt += 1;
}
}
result->push_back(std::move(new_rec));
}
VLOG(2) << "erase feasign num: " << debug_erase_cnt
<< " repush feasign num: " << debug_push_cnt;
}
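GetRandomData above strips the feasigns of the selected slots from each record and refills them from a randomly drawn candidate. A simplified, self-contained sketch of that replacement step, using stand-in types instead of Record and RecordCandidate:

#include <cstdint>
#include <map>
#include <set>
#include <vector>

struct FakeFeature {
  uint64_t sign;
  uint16_t slot;
};

std::vector<FakeFeature> ReplaceSlots(
    const std::vector<FakeFeature>& rec,
    const std::multimap<uint16_t, uint64_t>& candidate,
    const std::set<uint16_t>& slots_to_replace) {
  // drop features belonging to the shuffled slots
  std::vector<FakeFeature> out;
  for (const auto& f : rec) {
    if (slots_to_replace.count(f.slot) == 0) out.push_back(f);
  }
  // refill each shuffled slot from the candidate record
  for (uint16_t slot : slots_to_replace) {
    auto range = candidate.equal_range(slot);
    for (auto it = range.first; it != range.second; ++it) {
      out.push_back({it->second, slot});
    }
  }
  return out;
}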
// slots shuffle: write records back to input_channel_ with the selected slots shuffled
void MultiSlotDataset::SlotsShuffle(
const std::set<std::string>& slots_to_replace) {
int out_channel_size = 0;
if (cur_channel_ == 0) {
for (size_t i = 0; i < multi_output_channel_.size(); ++i) {
out_channel_size += multi_output_channel_[i]->Size();
}
} else {
for (size_t i = 0; i < multi_consume_channel_.size(); ++i) {
out_channel_size += multi_consume_channel_[i]->Size();
}
}
VLOG(2) << "DatasetImpl<T>::SlotsShuffle() begin with input channel size: "
<< input_channel_->Size()
<< " output channel size: " << out_channel_size;
if (!slots_shuffle_fea_eval_) {
VLOG(3) << "DatasetImpl<T>::SlotsShuffle() end,"
"fea eval mode off, need to set on for slots shuffle";
return;
}
if ((!input_channel_ || input_channel_->Size() == 0) &&
slots_shuffle_original_data_.size() == 0 && out_channel_size == 0) {
VLOG(3) << "DatasetImpl<T>::SlotsShuffle() end, no data to slots shuffle";
return;
}
platform::Timer timeline;
timeline.Start();
auto multi_slot_desc = data_feed_desc_.multi_slot_desc();
std::set<uint16_t> index_slots;
for (size_t i = 0; i < multi_slot_desc.slots_size(); ++i) {
std::string cur_slot = multi_slot_desc.slots(i).name();
if (slots_to_replace.find(cur_slot) != slots_to_replace.end()) {
index_slots.insert(i);
}
}
if (slots_shuffle_original_data_.size() == 0) {
// before first slots shuffle, instances could be in
// input_channel, output_channel or consume_channel
if (input_channel_ && input_channel_->Size() != 0) {
slots_shuffle_original_data_.reserve(input_channel_->Size());
input_channel_->Close();
input_channel_->ReadAll(slots_shuffle_original_data_);
} else {
CHECK(out_channel_size > 0); // NOLINT
if (cur_channel_ == 0) {
for (size_t i = 0; i < multi_output_channel_.size(); ++i) {
std::vector<Record> vec_data;
multi_output_channel_[i]->Close();
multi_output_channel_[i]->ReadAll(vec_data);
slots_shuffle_original_data_.reserve(
slots_shuffle_original_data_.size() + vec_data.size());
slots_shuffle_original_data_.insert(
slots_shuffle_original_data_.end(),
std::make_move_iterator(vec_data.begin()),
std::make_move_iterator(vec_data.end()));
vec_data.clear();
vec_data.shrink_to_fit();
multi_output_channel_[i]->Clear();
}
} else {
for (size_t i = 0; i < multi_consume_channel_.size(); ++i) {
std::vector<Record> vec_data;
multi_consume_channel_[i]->Close();
multi_consume_channel_[i]->ReadAll(vec_data);
slots_shuffle_original_data_.reserve(
slots_shuffle_original_data_.size() + vec_data.size());
slots_shuffle_original_data_.insert(
slots_shuffle_original_data_.end(),
std::make_move_iterator(vec_data.begin()),
std::make_move_iterator(vec_data.end()));
vec_data.clear();
vec_data.shrink_to_fit();
multi_consume_channel_[i]->Clear();
}
}
}
} else {
// if already have original data for slots shuffle, clear channel
input_channel_->Clear();
if (cur_channel_ == 0) {
for (size_t i = 0; i < multi_output_channel_.size(); ++i) {
if (!multi_output_channel_[i]) {
continue;
}
multi_output_channel_[i]->Clear();
}
} else {
for (size_t i = 0; i < multi_consume_channel_.size(); ++i) {
if (!multi_consume_channel_[i]) {
continue;
}
multi_consume_channel_[i]->Clear();
}
}
}
int end_size = 0;
if (cur_channel_ == 0) {
for (size_t i = 0; i < multi_output_channel_.size(); ++i) {
if (!multi_output_channel_[i]) {
continue;
}
end_size += multi_output_channel_[i]->Size();
}
} else {
for (size_t i = 0; i < multi_consume_channel_.size(); ++i) {
if (!multi_consume_channel_[i]) {
continue;
}
end_size += multi_consume_channel_[i]->Size();
}
}
CHECK(input_channel_->Size() == 0)
<< "input channel should be empty before slots shuffle";
std::vector<Record> random_data;
random_data.clear();
// get slots shuffled random_data
GetRandomData(index_slots, &random_data);
input_channel_->Open();
input_channel_->Write(std::move(random_data));
random_data.clear();
random_data.shrink_to_fit();
input_channel_->Close();
timeline.Pause();
VLOG(2) << "DatasetImpl<T>::SlotsShuffle() end"
<< ", memory data size for slots shuffle=" << input_channel_->Size()
<< ", cost time=" << timeline.ElapsedSec() << " seconds";
}
} // end namespace framework } // end namespace framework
} // end namespace paddle } // end namespace paddle
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#include <fstream> #include <fstream>
#include <memory> #include <memory>
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include <set>
#include <string> #include <string>
#include <thread> // NOLINT #include <thread> // NOLINT
#include <utility> #include <utility>
...@@ -57,10 +58,15 @@ class Dataset { ...@@ -57,10 +58,15 @@ class Dataset {
virtual void SetDataFeedDesc(const std::string& data_feed_desc_str) = 0; virtual void SetDataFeedDesc(const std::string& data_feed_desc_str) = 0;
// set channel num // set channel num
virtual void SetChannelNum(int channel_num) = 0; virtual void SetChannelNum(int channel_num) = 0;
// set parse ins id
virtual void SetParseInsId(bool parse_ins_id) = 0;
virtual void SetParseContent(bool parse_content) = 0;
// set merge by ins id // set merge by ins id
virtual void SetMergeByInsId(const std::vector<std::string>& merge_slot_list, virtual void SetMergeByInsId(const std::vector<std::string>& merge_slot_list,
bool erase_duplicate_feas, int min_merge_size, bool erase_duplicate_feas, int min_merge_size,
bool keep_unmerged_ins) = 0; bool keep_unmerged_ins) = 0;
// set fea eval mode
virtual void SetFeaEval(bool fea_eval, int record_candidate_size) = 0;
// get file list // get file list
virtual const std::vector<std::string>& GetFileList() = 0; virtual const std::vector<std::string>& GetFileList() = 0;
// get thread num // get thread num
...@@ -93,7 +99,11 @@ class Dataset { ...@@ -93,7 +99,11 @@ class Dataset {
// local shuffle data // local shuffle data
virtual void LocalShuffle() = 0; virtual void LocalShuffle() = 0;
// global shuffle data // global shuffle data
virtual void GlobalShuffle() = 0; virtual void GlobalShuffle(int thread_num = -1) = 0;
// for slots shuffle
virtual void SlotsShuffle(const std::set<std::string>& slots_to_replace) = 0;
virtual void GetRandomData(const std::set<uint16_t>& slots_to_replace,
std::vector<Record>* result) = 0;
// create readers // create readers
virtual void CreateReaders() = 0; virtual void CreateReaders() = 0;
// destroy readers // destroy readers
...@@ -104,6 +114,17 @@ class Dataset { ...@@ -104,6 +114,17 @@ class Dataset {
virtual int64_t GetShuffleDataSize() = 0; virtual int64_t GetShuffleDataSize() = 0;
// merge by ins id // merge by ins id
virtual void MergeByInsId() = 0; virtual void MergeByInsId() = 0;
// create preload readers
virtual void CreatePreLoadReaders() = 0;
// destroy preload readers after preload is done
virtual void DestroyPreLoadReaders() = 0;
// set preload thread num
virtual void SetPreLoadThreadNum(int thread_num) = 0;
// separate train thread and dataset thread
virtual void DynamicAdjustChannelNum(int channel_num) = 0;
virtual void DynamicAdjustReadersNum(int thread_num) = 0;
// set fleet send sleep seconds
virtual void SetFleetSendSleepSeconds(int seconds) = 0;
protected: protected:
virtual int ReceiveFromClient(int msg_type, int client_id, virtual int ReceiveFromClient(int msg_type, int client_id,
...@@ -126,13 +147,17 @@ class DatasetImpl : public Dataset { ...@@ -126,13 +147,17 @@ class DatasetImpl : public Dataset {
const std::string& fs_ugi); const std::string& fs_ugi);
virtual void SetDataFeedDesc(const std::string& data_feed_desc_str); virtual void SetDataFeedDesc(const std::string& data_feed_desc_str);
virtual void SetChannelNum(int channel_num); virtual void SetChannelNum(int channel_num);
virtual void SetParseInsId(bool parse_ins_id);
virtual void SetParseContent(bool parse_content);
virtual void SetMergeByInsId(const std::vector<std::string>& merge_slot_list, virtual void SetMergeByInsId(const std::vector<std::string>& merge_slot_list,
bool erase_duplicate_feas, int min_merge_size, bool erase_duplicate_feas, int min_merge_size,
bool keep_unmerged_ins); bool keep_unmerged_ins);
virtual void SetFeaEval(bool fea_eval, int record_candidate_size);
virtual const std::vector<std::string>& GetFileList() { return filelist_; } virtual const std::vector<std::string>& GetFileList() { return filelist_; }
virtual int GetThreadNum() { return thread_num_; } virtual int GetThreadNum() { return thread_num_; }
virtual int GetTrainerNum() { return trainer_num_; } virtual int GetTrainerNum() { return trainer_num_; }
virtual Channel<T> GetInputChannel() { return input_channel_; }
virtual int64_t GetFleetSendBatchSize() { return fleet_send_batch_size_; } virtual int64_t GetFleetSendBatchSize() { return fleet_send_batch_size_; }
virtual std::pair<std::string, std::string> GetHdfsConfig() { virtual std::pair<std::string, std::string> GetHdfsConfig() {
return std::make_pair(fs_name_, fs_ugi_); return std::make_pair(fs_name_, fs_ugi_);
...@@ -149,17 +174,27 @@ class DatasetImpl : public Dataset { ...@@ -149,17 +174,27 @@ class DatasetImpl : public Dataset {
virtual void WaitPreLoadDone(); virtual void WaitPreLoadDone();
virtual void ReleaseMemory(); virtual void ReleaseMemory();
virtual void LocalShuffle(); virtual void LocalShuffle();
virtual void GlobalShuffle(); virtual void GlobalShuffle(int thread_num = -1);
virtual void SlotsShuffle(const std::set<std::string>& slots_to_replace) {}
virtual void GetRandomData(const std::set<uint16_t>& slots_to_replace,
std::vector<Record>* result) {}
virtual void CreateReaders(); virtual void CreateReaders();
virtual void DestroyReaders(); virtual void DestroyReaders();
virtual int64_t GetMemoryDataSize(); virtual int64_t GetMemoryDataSize();
virtual int64_t GetShuffleDataSize(); virtual int64_t GetShuffleDataSize();
virtual void MergeByInsId() {} virtual void MergeByInsId() {}
virtual void CreatePreLoadReaders();
virtual void DestroyPreLoadReaders();
virtual void SetPreLoadThreadNum(int thread_num);
virtual void DynamicAdjustChannelNum(int channel_num);
virtual void DynamicAdjustReadersNum(int thread_num);
virtual void SetFleetSendSleepSeconds(int seconds);
protected: protected:
virtual int ReceiveFromClient(int msg_type, int client_id, virtual int ReceiveFromClient(int msg_type, int client_id,
const std::string& msg); const std::string& msg);
std::vector<std::shared_ptr<paddle::framework::DataFeed>> readers_; std::vector<std::shared_ptr<paddle::framework::DataFeed>> readers_;
std::vector<std::shared_ptr<paddle::framework::DataFeed>> preload_readers_;
paddle::framework::Channel<T> input_channel_; paddle::framework::Channel<T> input_channel_;
int channel_num_; int channel_num_;
std::vector<paddle::framework::Channel<T>> multi_output_channel_; std::vector<paddle::framework::Channel<T>> multi_output_channel_;
...@@ -168,6 +203,8 @@ class DatasetImpl : public Dataset { ...@@ -168,6 +203,8 @@ class DatasetImpl : public Dataset {
// and when finish reading, we set cur_channel = 1 - cur_channel, // and when finish reading, we set cur_channel = 1 - cur_channel,
// so if cur_channel=0, all data are in output_channel, else consume_channel // so if cur_channel=0, all data are in output_channel, else consume_channel
int cur_channel_; int cur_channel_;
std::vector<T> slots_shuffle_original_data_;
RecordCandidateList slots_shuffle_rclist_;
int thread_num_; int thread_num_;
paddle::framework::DataFeedDesc data_feed_desc_; paddle::framework::DataFeedDesc data_feed_desc_;
int trainer_num_; int trainer_num_;
...@@ -180,10 +217,16 @@ class DatasetImpl : public Dataset { ...@@ -180,10 +217,16 @@ class DatasetImpl : public Dataset {
int64_t fleet_send_sleep_seconds_; int64_t fleet_send_sleep_seconds_;
std::vector<std::thread> preload_threads_; std::vector<std::thread> preload_threads_;
bool merge_by_insid_; bool merge_by_insid_;
bool parse_ins_id_;
bool parse_content_;
bool erase_duplicate_feas_; bool erase_duplicate_feas_;
bool keep_unmerged_ins_; bool keep_unmerged_ins_;
int min_merge_size_; int min_merge_size_;
std::vector<std::string> merge_slots_list_; std::vector<std::string> merge_slots_list_;
bool slots_shuffle_fea_eval_ = false;
int preload_thread_num_;
std::mutex global_index_mutex_;
int64_t global_index_ = 0;
}; };
// use std::vector<MultiSlotType> or Record as data type // use std::vector<MultiSlotType> or Record as data type
...@@ -191,6 +234,9 @@ class MultiSlotDataset : public DatasetImpl<Record> { ...@@ -191,6 +234,9 @@ class MultiSlotDataset : public DatasetImpl<Record> {
public: public:
MultiSlotDataset() {} MultiSlotDataset() {}
virtual void MergeByInsId(); virtual void MergeByInsId();
virtual void SlotsShuffle(const std::set<std::string>& slots_to_replace);
virtual void GetRandomData(const std::set<uint16_t>& slots_to_replace,
std::vector<Record>* result);
virtual ~MultiSlotDataset() {} virtual ~MultiSlotDataset() {}
}; };
......
...@@ -48,22 +48,6 @@ bool DDim::operator==(const DDim& d) const { ...@@ -48,22 +48,6 @@ bool DDim::operator==(const DDim& d) const {
bool DDim::operator!=(const DDim& d) const { return !(*this == d); } bool DDim::operator!=(const DDim& d) const { return !(*this == d); }
std::vector<int64_t> vectorize(const DDim& ddim) {
std::vector<int64_t> result(DDim::kMaxRank);
dynamic_dim_assign(ddim.Get(), result.data(), ddim.size());
result.resize(ddim.size());
return result;
}
// NOTE: framework::vectorize converts to type int64_t
// which does not fit cudnn inputs.
std::vector<int> vectorize2int(const DDim& ddim) {
std::vector<int> result(DDim::kMaxRank);
dynamic_dim_assign(ddim.Get(), result.data(), ddim.size());
result.resize(ddim.size());
return result;
}
struct ProductVisitor { struct ProductVisitor {
template <int D> template <int D>
inline int64_t operator()(const Dim<D>& dim) { inline int64_t operator()(const Dim<D>& dim) {
......
...@@ -170,8 +170,13 @@ DDim make_ddim(const std::vector<int>& dims); ...@@ -170,8 +170,13 @@ DDim make_ddim(const std::vector<int>& dims);
*/ */
DDim make_ddim(std::initializer_list<int64_t> dims); DDim make_ddim(std::initializer_list<int64_t> dims);
std::vector<int64_t> vectorize(const DDim& ddim); template <typename T = int64_t>
std::vector<int> vectorize2int(const DDim& ddim); std::vector<T> vectorize(const DDim& ddim) {
std::vector<T> result(DDim::kMaxRank);
dynamic_dim_assign(ddim.Get(), result.data(), ddim.size());
result.resize(ddim.size());
return result;
}
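vectorize and vectorize2int are merged here into one function template with a default element type. A standalone sketch of the same pattern, assuming a toy TinyDim in place of DDim:

#include <cstdint>
#include <vector>

struct TinyDim {
  static constexpr int kMaxRank = 9;
  int64_t d[kMaxRank];
  int rank;
};

template <typename T = int64_t>
std::vector<T> vectorize(const TinyDim& dim) {
  std::vector<T> result(TinyDim::kMaxRank);
  for (int i = 0; i < dim.rank; ++i) result[i] = static_cast<T>(dim.d[i]);
  result.resize(dim.rank);
  return result;
}

// usage: vectorize(dim) keeps int64_t; vectorize<int>(dim) covers the role
// of the removed vectorize2int for cuDNN-style int shapes.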
int64_t product(const DDim& ddim); int64_t product(const DDim& ddim);
......
...@@ -3,7 +3,10 @@ cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context ...@@ -3,7 +3,10 @@ cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context
cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory) cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory) cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
cc_library(share_tensor_buffer_functor SRCS share_tensor_buffer_functor.cc DEPS framework_proto scope place operator op_registry)
cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry) cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
cc_library(share_tensor_buffer_op_handle SRCS share_tensor_buffer_op_handle.cc DEPS op_handle_base scope computation_op_handle share_tensor_buffer_functor)
cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place operator op_registry) cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place operator op_registry)
cc_library(fetch_barrier_op_handle SRCS fetch_barrier_op_handle.cc DEPS framework_proto scope place operator op_registry) cc_library(fetch_barrier_op_handle SRCS fetch_barrier_op_handle.cc DEPS framework_proto scope place operator op_registry)
cc_library(multi_devices_helper SRCS multi_devices_helper.cc DEPS graph graph_helper) cc_library(multi_devices_helper SRCS multi_devices_helper.cc DEPS graph graph_helper)
...@@ -59,12 +62,7 @@ cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope d ...@@ -59,12 +62,7 @@ cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope d
cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper) cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper)
cc_library(share_tensor_buffer_op_handle SRCS share_tensor_buffer_op_handle.cc DEPS op_handle_base scope) set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass buffer_shared_inplace_op_pass buffer_shared_cross_op_memory_reuse_pass)
set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass inplace_op_pass buffer_shared_inplace_op_pass)
if (WITH_GPU)
list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass)
endif()
cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS}) cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS})
cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
...@@ -82,18 +80,27 @@ cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_ha ...@@ -82,18 +80,27 @@ cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_ha
device_context broadcast_op_handle) device_context broadcast_op_handle)
cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
device_context gather_op_handle) device_context gather_op_handle)
cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_executor.cc DEPS ssa_graph_executor)
cc_library(scope_buffered_monitor SRCS scope_buffered_monitor.cc DEPS scope profiler selected_rows)
cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_executor.cc DEPS ssa_graph_executor scope_buffered_monitor)
#cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory #cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
# device_context reduce_op_handle ) # device_context reduce_op_handle )
cc_library(fast_threaded_ssa_graph_executor SRCS fast_threaded_ssa_graph_executor.cc cc_library(fast_threaded_ssa_graph_executor SRCS fast_threaded_ssa_graph_executor.cc
DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context) DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context)
cc_test(fused_broadcast_op_test SRCS fused_broadcast_op_handle_test.cc DEPS fused_broadcast_op_handle) cc_test(fused_broadcast_op_test SRCS fused_broadcast_op_handle_test.cc DEPS fused_broadcast_op_handle)
if(WITH_NGRAPH)
set(NGRAPH_BS_DEPS ngraph)
else()
set(NGRAPH_BS_DEPS)
endif()
cc_library(build_strategy SRCS build_strategy.cc DEPS cc_library(build_strategy SRCS build_strategy.cc DEPS
graph_viz_pass multi_devices_graph_pass graph_viz_pass multi_devices_graph_pass
multi_devices_graph_print_pass multi_devices_graph_check_pass multi_devices_graph_print_pass multi_devices_graph_check_pass
fuse_elewise_add_act_pass multi_batch_merge_pass fuse_elewise_add_act_pass multi_batch_merge_pass
fuse_relu_depthwise_conv_pass fuse_relu_depthwise_conv_pass
memory_optimize_pass lock_free_optimize_pass lock_free_optimize_pass
coalesce_grad_tensor_pass fuse_all_reduce_op_pass backward_optimizer_op_deps_pass coalesce_grad_tensor_pass fuse_all_reduce_op_pass backward_optimizer_op_deps_pass
fuse_adam_op_pass fuse_sgd_op_pass fuse_momentum_op_pass record_skip_memory_opt_vars_pass) fuse_adam_op_pass fuse_sgd_op_pass fuse_momentum_op_pass
${NGRAPH_BS_DEPS})
...@@ -20,12 +20,9 @@ ...@@ -20,12 +20,9 @@
#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
// asynchronous nccl allreduce or synchronous issue: #ifdef PADDLE_WITH_CUDA
// https://github.com/PaddlePaddle/Paddle/issues/15049 DECLARE_bool(sync_nccl_allreduce);
DEFINE_bool( #endif
sync_nccl_allreduce, true,
"If set true, will call `cudaStreamSynchronize(nccl_stream)`"
"after allreduce, this mode can get better performance in some scenarios.");
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -43,11 +40,124 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, ...@@ -43,11 +40,124 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
const std::vector<Scope *> &local_scopes, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places) const std::vector<platform::Place> &places)
: OpHandleBase(node), local_scopes_(local_scopes), places_(places) {} : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {
PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
}
#endif #endif
void AllReduceOpHandle::RunImpl() {
platform::RecordEvent record_event(Name());
WaitInputVarGenerated();
std::vector<VarHandleBase *> inputs = this->Inputs();
std::vector<VarHandleBase *> outputs = this->Outputs();
auto in_var_handles = DynamicCast<VarHandle>(inputs);
auto out_var_handles = DynamicCast<VarHandle>(outputs);
AllReduceImpl(in_var_handles, out_var_handles);
}
void AllReduceOpHandle::AllReduceImpl(
const std::vector<VarHandle *> &in_var_handles,
const std::vector<VarHandle *> &out_var_handles) {
size_t num_places = places_.size();
PADDLE_ENFORCE_EQ(
in_var_handles.size(), num_places,
"The NoDummyInputSize should be equal to the number of places.");
PADDLE_ENFORCE_EQ(
in_var_handles.size(), out_var_handles.size(),
"The NoDummyInputSize and NoDummyOutputSize should be equal.");
PADDLE_ENFORCE_EQ(local_exec_scopes_.size(), num_places);
std::vector<const void *> lod_tensor_data;
std::vector<platform::Place> places;
lod_tensor_data.reserve(num_places);
places.reserve(num_places);
int64_t numel = -1;
bool is_gpu_place = false;
auto dtype = static_cast<framework::proto::VarType::Type>(0);
for (size_t i = 0; i < local_exec_scopes_.size(); ++i) {
auto &local_scope = local_exec_scopes_[i];
auto var = local_scope->FindVar(in_var_handles[i]->name());
PADDLE_ENFORCE_NOT_NULL(var, "%s is not found in scope.",
in_var_handles[i]->name());
auto &lod_tensor = var->Get<LoDTensor>();
if (i == 0) {
numel = static_cast<int64_t>(lod_tensor.numel());
dtype = lod_tensor.type();
is_gpu_place = platform::is_gpu_place(lod_tensor.place());
}
PADDLE_ENFORCE_EQ(numel, static_cast<int64_t>(lod_tensor.numel()));
PADDLE_ENFORCE_EQ(dtype, lod_tensor.type());
PADDLE_ENFORCE_EQ(is_gpu_place, platform::is_gpu_place(lod_tensor.place()));
lod_tensor_data.emplace_back(lod_tensor.data<void>());
places.emplace_back(lod_tensor.place());
VLOG(10) << "place:" << i << ", input_name:" << in_var_handles[i]->name()
<< ", out_name:" << out_var_handles[i]->name();
PADDLE_ENFORCE_EQ(in_var_handles[i]->name(), out_var_handles[i]->name(),
"The name of input and output should be equal.");
}
std::vector<std::string> grad_var_names;
grad_var_names.reserve(num_places);
for (auto &out_var : out_var_handles) {
grad_var_names.emplace_back(out_var->Name());
}
AllReduceFunc(lod_tensor_data, dtype, numel, places, grad_var_names);
}
void AllReduceOpHandle::AllReduceFunc(
std::vector<const void *> lod_tensor_data,
const framework::proto::VarType::Type &dtype, int64_t numel,
const std::vector<platform::Place> &places,
const std::vector<std::string> &out_var_names) {
if (is_gpu_place(places[0])) {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
PADDLE_ENFORCE_NOT_NULL(nccl_ctxs_, "nccl_ctxs should not be nullptr.");
ncclDataType_t nccl_dtype = platform::ToNCCLDataType(dtype);
std::vector<std::function<void()>> all_reduce_calls;
for (size_t i = 0; i < local_exec_scopes_.size(); ++i) {
auto &p = places[i];
void *buffer = const_cast<void *>(lod_tensor_data.at(i));
all_reduce_calls.emplace_back([=] {
NCCLAllReduce(p, buffer, buffer, numel, nccl_dtype, ncclSum);
});
}
NCCLAllReduceFunc(all_reduce_calls);
#else
PADDLE_THROW("Not compiled with CUDA.");
#endif
} else { // Specially handle CPU-only operators' gradients, e.g. CRF.
auto &trg = *local_exec_scopes_[0]
->FindVar(out_var_names[0])
->GetMutable<LoDTensor>();
// Reduce All Tensor to trg in CPU
ReduceBufferData func(lod_tensor_data, trg.data<void>(), numel);
VisitDataType(trg.type(), func);
for (size_t i = 1; i < local_exec_scopes_.size(); ++i) {
auto &scope = local_exec_scopes_[i];
auto &p = places[i];
auto *var = scope->FindVar(out_var_names[i]);
size_t size = numel * SizeOfType(trg.type());
RunAndRecordEvent(p, [&trg, var, p, size] {
auto dst_ptr = var->GetMutable<framework::LoDTensor>()->data<void>();
platform::CPUPlace cpu_place;
memory::Copy(cpu_place, dst_ptr, cpu_place, trg.data<void>(), size);
});
}
}
VLOG(10) << Name() << " size:" << numel * SizeOfType(dtype);
}
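When the tensors live on CPU, AllReduceFunc reduces every buffer into the first scope's tensor and then copies the result back to the other scopes. A CPU-only sketch of that fallback, with raw float buffers standing in for LoDTensor storage (illustrative, not the Paddle API):

#include <cstring>
#include <vector>

void CpuAllReduce(std::vector<std::vector<float>>& buffers) {
  if (buffers.empty()) return;
  // reduce all buffers into the first one
  auto& trg = buffers[0];
  for (size_t i = 1; i < buffers.size(); ++i) {
    for (size_t j = 0; j < trg.size(); ++j) trg[j] += buffers[i][j];
  }
  // broadcast the reduced result back to the other buffers
  for (size_t i = 1; i < buffers.size(); ++i) {
    std::memcpy(buffers[i].data(), trg.data(), trg.size() * sizeof(float));
  }
}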
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
void AllReduceOpHandle::RunAllReduceFuncs( void AllReduceOpHandle::NCCLAllReduceFunc(
const std::vector<std::function<void()>> &all_reduce_calls) { const std::vector<std::function<void()>> &all_reduce_calls) {
this->RunAndRecordEvent([&] { this->RunAndRecordEvent([&] {
if (all_reduce_calls.size() == 1UL) { if (all_reduce_calls.size() == 1UL) {
...@@ -83,85 +193,6 @@ void AllReduceOpHandle::RunAllReduceFuncs( ...@@ -83,85 +193,6 @@ void AllReduceOpHandle::RunAllReduceFuncs(
} }
#endif #endif
void AllReduceOpHandle::RunImpl() {
platform::RecordEvent record_event(Name());
WaitInputVarGenerated();
auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
PADDLE_ENFORCE_EQ(
in_var_handles.size(), places_.size(),
"The NoDummyInputSize should be equal to the number of places.");
PADDLE_ENFORCE_EQ(
in_var_handles.size(), out_var_handles.size(),
"The NoDummyInputSize and NoDummyOutputSize should be equal.");
std::vector<const LoDTensor *> lod_tensors;
for (size_t i = 0; i < local_scopes_.size(); ++i) {
auto &local_scope = local_exec_scopes_[i];
auto &lod_tensor =
local_scope->FindVar(in_var_handles[i]->name())->Get<LoDTensor>();
lod_tensors.emplace_back(&lod_tensor);
VLOG(10) << "place:" << i << ", input_name:" << in_var_handles[i]->name()
<< ", out_name:" << out_var_handles[i]->name();
PADDLE_ENFORCE_EQ(in_var_handles[i]->name(), out_var_handles[i]->name(),
"The name of input and output should be equal.");
}
if (platform::is_gpu_place(lod_tensors[0]->place())) {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr.");
int dtype = -1;
size_t numel = 0;
std::vector<std::function<void()>> all_reduce_calls;
for (size_t i = 0; i < local_scopes_.size(); ++i) {
auto &p = places_[i];
auto &lod_tensor = *lod_tensors[i];
void *buffer = const_cast<void *>(lod_tensor.data<void>());
if (dtype == -1) {
dtype = platform::ToNCCLDataType(lod_tensor.type());
}
if (numel == 0) {
numel = static_cast<size_t>(lod_tensor.numel());
}
all_reduce_calls.emplace_back([=] {
NCCLAllReduce(p, buffer, buffer, numel,
static_cast<ncclDataType_t>(dtype), ncclSum);
});
}
VLOG(10) << "allreduce size:" << numel * SizeOfType(lod_tensors[0]->type());
RunAllReduceFuncs(all_reduce_calls);
#else
PADDLE_THROW("Not compiled with CUDA");
#endif
} else { // Special handle CPU only Operator's gradient. Like CRF
auto &trg = *this->local_exec_scopes_[0]
->FindVar(out_var_handles[0]->name())
->GetMutable<framework::LoDTensor>();
// Reduce All Tensor to trg in CPU
ReduceLoDTensor func(lod_tensors, &trg);
VisitDataType(lod_tensors[0]->type(), func);
for (size_t i = 1; i < local_scopes_.size(); ++i) {
auto &scope = local_exec_scopes_[i];
auto &p = places_[i];
auto *var = scope->FindVar(out_var_handles[i]->name());
auto *dev_ctx = dev_ctxes_.at(p);
RunAndRecordEvent(p, [&trg, var, dev_ctx, p] {
auto &tensor_gpu = *var->GetMutable<framework::LoDTensor>();
auto &tensor_cpu = trg;
TensorCopy(tensor_cpu, p, *dev_ctx, &tensor_gpu);
});
}
}
}
std::string AllReduceOpHandle::Name() const { return "all_reduce"; } std::string AllReduceOpHandle::Name() const { return "all_reduce"; }
} // namespace details } // namespace details
} // namespace framework } // namespace framework
......
...@@ -61,9 +61,17 @@ class AllReduceOpHandle : public OpHandleBase { ...@@ -61,9 +61,17 @@ class AllReduceOpHandle : public OpHandleBase {
#endif #endif
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
void RunAllReduceFuncs( void NCCLAllReduceFunc(
const std::vector<std::function<void()>> &all_reduce_calls); const std::vector<std::function<void()>> &all_reduce_calls);
#endif #endif
void AllReduceImpl(const std::vector<VarHandle *> &in_var_handles,
const std::vector<VarHandle *> &out_var_handles);
void AllReduceFunc(std::vector<const void *> lod_tensor_data,
const framework::proto::VarType::Type &dtype,
int64_t numel, const std::vector<platform::Place> &places,
const std::vector<std::string> &out_var_handles);
}; };
} // namespace details } // namespace details
......
...@@ -38,8 +38,6 @@ void BroadcastOpHandle::RunImpl() { ...@@ -38,8 +38,6 @@ void BroadcastOpHandle::RunImpl() {
VarHandle *in_var_handle = in_var_handles[0]; VarHandle *in_var_handle = in_var_handles[0];
WaitInputVarGenerated();
BroadcastOneVar(*in_var_handle, out_var_handles, local_exec_scopes_); BroadcastOneVar(*in_var_handle, out_var_handles, local_exec_scopes_);
} }
...@@ -59,6 +57,7 @@ void BroadcastOpHandle::BroadcastOneVar( ...@@ -59,6 +57,7 @@ void BroadcastOpHandle::BroadcastOneVar(
InitOutputValue(in_var_handle, out_var_handles); InitOutputValue(in_var_handle, out_var_handles);
if (platform::is_cpu_place(in_tensor.place())) { if (platform::is_cpu_place(in_tensor.place())) {
WaitInputVarGenerated();
for (auto *out_var_handle : out_var_handles) { for (auto *out_var_handle : out_var_handles) {
if (out_var_handle->IsTheSameVar(in_var_handle)) { if (out_var_handle->IsTheSameVar(in_var_handle)) {
continue; continue;
...@@ -109,6 +108,7 @@ void BroadcastOpHandle::BroadcastOneVar( ...@@ -109,6 +108,7 @@ void BroadcastOpHandle::BroadcastOneVar(
}); });
} }
WaitInputVarGenerated();
this->RunAndRecordEvent([&] { this->RunAndRecordEvent([&] {
{ {
platform::NCCLGroupGuard guard; platform::NCCLGroupGuard guard;
...@@ -126,6 +126,9 @@ void BroadcastOpHandle::BroadcastOneVar( ...@@ -126,6 +126,9 @@ void BroadcastOpHandle::BroadcastOneVar(
&VariableVisitor::GetMutableTensor(out_var)); &VariableVisitor::GetMutableTensor(out_var));
} }
}); });
for (auto &p : places_) {
nccl_ctxs_->DevCtx(p)->Wait();
}
#else #else
PADDLE_THROW("CUDA is not enabled."); PADDLE_THROW("CUDA is not enabled.");
#endif #endif
......
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
#include <unordered_set> #include <unordered_set>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "boost/optional.hpp"
#include "paddle/fluid/framework/ir/pass_builder.h" #include "paddle/fluid/framework/ir/pass_builder.h"
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
...@@ -88,8 +89,8 @@ struct BuildStrategy { ...@@ -88,8 +89,8 @@ struct BuildStrategy {
bool fuse_elewise_add_act_ops_{false}; bool fuse_elewise_add_act_ops_{false};
// Fuse_all_optimizer_ops and fuse_all_reduce_ops require that gradients // Fuse_all_optimizer_ops and fuse_all_reduce_ops require that gradients
// should not be sparse types // should not be sparse types
bool fuse_all_optimizer_ops_{false}; boost::optional<bool> fuse_all_optimizer_ops_{boost::none};
bool fuse_all_reduce_ops_{false}; boost::optional<bool> fuse_all_reduce_ops_{boost::none};
// fuse_relu_depthwise_conv can fuse the `relu -> // fuse_relu_depthwise_conv can fuse the `relu ->
// depthwise_conv` // depthwise_conv`
bool fuse_relu_depthwise_conv_{false}; bool fuse_relu_depthwise_conv_{false};
...@@ -97,7 +98,7 @@ struct BuildStrategy { ...@@ -97,7 +98,7 @@ struct BuildStrategy {
// faster. Because fusing broadcast OP equals delaying the execution of all // faster. Because fusing broadcast OP equals delaying the execution of all
// broadcast Ops, in this case, all nccl streams are used only for reduce // broadcast Ops, in this case, all nccl streams are used only for reduce
// operations for a period of time. // operations for a period of time.
bool fuse_broadcast_ops_{false}; boost::optional<bool> fuse_broadcast_ops_{boost::none};
// replace batch_norm with sync_batch_norm. // replace batch_norm with sync_batch_norm.
bool sync_batch_norm_{false}; bool sync_batch_norm_{false};
...@@ -108,19 +109,14 @@ struct BuildStrategy { ...@@ -108,19 +109,14 @@ struct BuildStrategy {
// FLAGS_use_mkldnn=false // FLAGS_use_mkldnn=false
std::unordered_set<std::string> mkldnn_enabled_op_types_; std::unordered_set<std::string> mkldnn_enabled_op_types_;
// FIXME(liuwei1031) disable memory_optimzie and enable_inplace in 1.4 // By default, memory_optimize would be opened if gc is disabled, and
// to open them by default, we need to solve the fetch variable issue // be closed if gc is enabled.
// TODO(liuwei1031): memory_optimize depends on kStaleProgramOpDescs, // Users can forcibly enable/disable memory_optimize by setting True/False.
// it is not appropriate, because kStaleProgramOpDescs will be removed in the boost::optional<bool> memory_optimize_{boost::none};
// near future.
bool memory_optimize_{false};
// Turn on inplace by default. // Turn on inplace by default.
bool enable_inplace_{true}; bool enable_inplace_{true};
// TODO(zjl): Remove this flag when MemoryOptimizePass is refactored
bool use_legacy_memory_optimize_strategy_{false};
// FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode, // FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode,
// num_trainers is 1, so the current fields of build_strategy doesn't tell if // num_trainers is 1, so the current fields of build_strategy doesn't tell if
// it's distributed model. // it's distributed model.
......
...@@ -96,7 +96,8 @@ void EagerDeletionOpHandle::RunImpl() { ...@@ -96,7 +96,8 @@ void EagerDeletionOpHandle::RunImpl() {
std::deque<std::shared_ptr<memory::Allocation>> garbages; std::deque<std::shared_ptr<memory::Allocation>> garbages;
for (size_t i = 0; i < var_infos_.size(); ++i) { for (size_t i = 0; i < var_infos_.size(); ++i) {
auto *var_info = var_infos_[i]; auto *var_info = var_infos_[i];
if (var_info->IsSkipped() || !var_info->DecreaseRefCnt()) { if (var_info->IsSkippedAllMemoryOptimization() ||
!var_info->DecreaseRefCnt()) {
continue; continue;
} }
......
...@@ -31,7 +31,7 @@ struct ExecutionStrategy { ...@@ -31,7 +31,7 @@ struct ExecutionStrategy {
// iterations the framework cleans up a local execution scope. // iterations the framework cleans up a local execution scope.
// In some models, the value of this parameter has a great // In some models, the value of this parameter has a great
// influence on the performance(about 15%) of the program. // influence on the performance(about 15%) of the program.
size_t num_iteration_per_drop_scope_{1}; size_t num_iteration_per_drop_scope_{100};
// At present, the kExperimental executor is the fastest in most models. // At present, the kExperimental executor is the fastest in most models.
ExecutorType type_{kExperimental}; ExecutorType type_{kExperimental};
// This debug option. // This debug option.
......
...@@ -12,8 +12,8 @@ ...@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
#include <deque>
#include <memory> #include <memory>
#include <queue>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <unordered_set> #include <unordered_set>
...@@ -191,13 +191,13 @@ void FastThreadedSSAGraphExecutor::RunOpAsync( ...@@ -191,13 +191,13 @@ void FastThreadedSSAGraphExecutor::RunOpAsync(
const std::shared_ptr<BlockingQueue<size_t>> &complete_q) { const std::shared_ptr<BlockingQueue<size_t>> &complete_q) {
++remaining_; ++remaining_;
this->pool_.enqueue([=] { this->pool_.enqueue([=] {
std::queue<OpHandleBase *> op_queue; std::deque<OpHandleBase *> op_queue;
op_queue.push(op); op_queue.push_front(op);
size_t complete = 0; size_t complete = 0;
while (!op_queue.empty()) { while (!op_queue.empty()) {
OpHandleBase *op_to_run = op_queue.front(); OpHandleBase *op_to_run = op_queue.back();
op_queue.pop(); op_queue.pop_back();
if (!RunOp(op_to_run, complete_q, &complete)) { if (!RunOp(op_to_run, complete_q, &complete)) {
return; return;
...@@ -213,7 +213,7 @@ void FastThreadedSSAGraphExecutor::RunOpAsync( ...@@ -213,7 +213,7 @@ void FastThreadedSSAGraphExecutor::RunOpAsync(
// NOTE(zjl): op with highest priority should run // NOTE(zjl): op with highest priority should run
// first without switching to another thread. // first without switching to another thread.
if (pending_op->GetPriority() == OpHandleBase::Priority::kHighest) { if (pending_op->GetPriority() == OpHandleBase::Priority::kHighest) {
op_queue.push(pending_op); op_queue.push_back(pending_op);
} else { } else {
if (op_to_run == nullptr) { if (op_to_run == nullptr) {
op_to_run = pending_op; op_to_run = pending_op;
...@@ -224,7 +224,9 @@ void FastThreadedSSAGraphExecutor::RunOpAsync( ...@@ -224,7 +224,9 @@ void FastThreadedSSAGraphExecutor::RunOpAsync(
} }
} }
if (op_to_run != nullptr) op_queue.push(op_to_run); if (op_to_run != nullptr) {
op_queue.push_front(op_to_run);
}
} }
--remaining_; --remaining_;
complete_q->Push(complete); complete_q->Push(complete);
......
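The executor switches from a FIFO std::queue to a std::deque drained from the back, so an op flagged with the highest priority runs immediately on the same thread while the ordinary fallback op is deferred. A tiny self-contained sketch of that ordering, with strings standing in for OpHandleBase pointers:

#include <deque>
#include <iostream>
#include <string>

int main() {
  std::deque<std::string> op_queue;
  op_queue.push_front("root");
  while (!op_queue.empty()) {
    std::string op = op_queue.back();  // hot end: runs next
    op_queue.pop_back();
    std::cout << "run " << op << "\n";
    if (op == "root") {
      op_queue.push_back("high_priority");  // runs right after root
      op_queue.push_front("normal");        // runs last
    }
  }
  // prints: run root, run high_priority, run normal
}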
...@@ -61,12 +61,17 @@ void FetchOpHandle::RunImpl() { ...@@ -61,12 +61,17 @@ void FetchOpHandle::RunImpl() {
var_handle->name()); var_handle->name());
auto &t = var->Get<framework::LoDTensor>(); auto &t = var->Get<framework::LoDTensor>();
if (platform::is_gpu_place(t.place())) { if (t.IsInitialized() && t.numel() > 0) {
if (platform::is_gpu_place(t.place())) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
TensorCopy(t, cpu, &tensors_[i]); TensorCopy(t, cpu, &tensors_[i]);
#endif #endif
} else {
tensors_[i].ShareDataWith(t);
}
} else { } else {
tensors_[i].ShareDataWith(t); tensors_[i].clear();
tensors_[i].Resize({0});
} }
tensors_[i].set_lod(t.lod()); tensors_[i].set_lod(t.lod());
} }
......
...@@ -33,28 +33,18 @@ FusedAllReduceOpHandle::FusedAllReduceOpHandle( ...@@ -33,28 +33,18 @@ FusedAllReduceOpHandle::FusedAllReduceOpHandle(
ir::Node *node, const std::vector<Scope *> &local_scopes, ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places, const size_t num_of_all_reduce, const std::vector<platform::Place> &places, const size_t num_of_all_reduce,
const platform::NCCLCommunicator *ctxs) const platform::NCCLCommunicator *ctxs)
: NCCLOpHandleBase(node, places, ctxs), : AllReduceOpHandle(node, local_scopes, places, ctxs),
local_scopes_(local_scopes), num_of_all_reduce_(num_of_all_reduce) {}
num_of_all_reduce_(num_of_all_reduce) {
PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
}
#else #else
FusedAllReduceOpHandle::FusedAllReduceOpHandle( FusedAllReduceOpHandle::FusedAllReduceOpHandle(
ir::Node *node, const std::vector<Scope *> &local_scopes, ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places, const size_t num_of_all_reduce) const std::vector<platform::Place> &places, const size_t num_of_all_reduce)
: OpHandleBase(node), : AllReduceOpHandle(node, local_scopes, places),
local_scopes_(local_scopes), num_of_all_reduce_(num_of_all_reduce) {}
places_(places),
num_of_all_reduce_(num_of_all_reduce) {
PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
}
#endif #endif
void FusedAllReduceOpHandle::RunImpl() { void FusedAllReduceOpHandle::RunImpl() {
platform::RecordEvent record_event(Name()); platform::RecordEvent record_event(Name());
VLOG(4) << this->DebugString(); VLOG(4) << this->DebugString();
WaitInputVarGenerated(); WaitInputVarGenerated();
...@@ -71,6 +61,30 @@ void FusedAllReduceOpHandle::RunImpl() { ...@@ -71,6 +61,30 @@ void FusedAllReduceOpHandle::RunImpl() {
in_var_handles.size(), out_var_handles.size(), in_var_handles.size(), out_var_handles.size(),
"The NoDummyInputSize and NoDummyOutputSize should be equal."); "The NoDummyInputSize and NoDummyOutputSize should be equal.");
// Note: some gradient ops don't have a CUDA kernel, so their gradients
// live in CPUPlace; in this case, the all-reduce should not be fused.
if (InputIsInDifferentPlace(in_var_handles)) {
for (size_t j = 0; j < num_of_all_reduce_; ++j) {
std::vector<VarHandle *> dev_inputs;
std::vector<VarHandle *> dev_outputs;
dev_inputs.reserve(place_num);
dev_outputs.reserve(place_num);
for (size_t idx = 0; idx < place_num; ++idx) {
dev_inputs.emplace_back(in_var_handles.at(j * place_num + idx));
dev_outputs.emplace_back(out_var_handles.at(j * place_num + idx));
}
AllReduceImpl(dev_inputs, dev_outputs);
}
} else {
FusedAllReduceFunc(in_var_handles, out_var_handles);
}
}
void FusedAllReduceOpHandle::FusedAllReduceFunc(
const std::vector<VarHandle *> &in_var_handles,
const std::vector<VarHandle *> &out_var_handles) {
size_t place_num = places_.size();
GradientAndLoDTensor grads_tensor; GradientAndLoDTensor grads_tensor;
grads_tensor.resize(place_num); grads_tensor.resize(place_num);
...@@ -87,14 +101,11 @@ void FusedAllReduceOpHandle::RunImpl() { ...@@ -87,14 +101,11 @@ void FusedAllReduceOpHandle::RunImpl() {
static_cast<framework::proto::VarType::Type>(0); static_cast<framework::proto::VarType::Type>(0);
GetDTypeAndNumel(g_tensor, &ele_dtype, &element_num); GetDTypeAndNumel(g_tensor, &ele_dtype, &element_num);
if (numel == -1) { if (scope_idx == 0) {
numel = element_num; numel = element_num;
}
if (dtype == static_cast<framework::proto::VarType::Type>(0)) {
dtype = ele_dtype; dtype = ele_dtype;
PADDLE_ENFORCE_NE(ele_dtype,
static_cast<framework::proto::VarType::Type>(0));
} }
PADDLE_ENFORCE_EQ(ele_dtype, dtype); PADDLE_ENFORCE_EQ(ele_dtype, dtype);
// Check whether the address space is contiguous. // Check whether the address space is contiguous.
...@@ -134,66 +145,36 @@ void FusedAllReduceOpHandle::RunImpl() { ...@@ -134,66 +145,36 @@ void FusedAllReduceOpHandle::RunImpl() {
} }
std::vector<const void *> lod_tensor_data; std::vector<const void *> lod_tensor_data;
lod_tensor_data.reserve(place_num);
for (size_t scope_idx = 0; scope_idx < place_num; ++scope_idx) { for (size_t scope_idx = 0; scope_idx < place_num; ++scope_idx) {
auto data = grads_tensor.at(scope_idx).at(0).second->data<void>(); auto data = grads_tensor.at(scope_idx).at(0).second->data<void>();
lod_tensor_data.emplace_back(data); lod_tensor_data.emplace_back(data);
} }
std::vector<std::string> grad_var_names;
grad_var_names.reserve(place_num);
for (auto &grad_t : grads_tensor) {
grad_var_names.emplace_back(grad_t.at(0).first);
}
if (platform::is_gpu_place(places_[0])) { AllReduceFunc(lod_tensor_data, dtype, numel, this->places_, grad_var_names);
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) }
PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr.");
int nccl_dtype = platform::ToNCCLDataType(dtype);
std::vector<std::function<void()>> all_reduce_calls;
for (size_t i = 0; i < local_scopes_.size(); ++i) {
auto &p = places_[i];
void *buffer = const_cast<void *>(lod_tensor_data.at(i));
all_reduce_calls.emplace_back([=] {
NCCLAllReduce(p, buffer, buffer, numel,
static_cast<ncclDataType_t>(nccl_dtype), ncclSum);
});
}
VLOG(10) << "fusedallreduce size:" << numel * SizeOfType(dtype); bool FusedAllReduceOpHandle::InputIsInDifferentPlace(
const std::vector<VarHandle *> &in_var_handles) const {
this->RunAndRecordEvent([&] { for (size_t scope_idx = 0; scope_idx < local_scopes_.size(); ++scope_idx) {
if (all_reduce_calls.size() == 1UL) { auto *local_scope = local_exec_scopes_[scope_idx];
// Do not use NCCLGroup when manage NCCL by per thread per device size_t place_num = places_.size();
all_reduce_calls[0](); for (size_t j = 0; j < in_var_handles.size(); j += place_num) {
} else { auto var_name = in_var_handles[j]->name();
platform::NCCLGroupGuard guard; auto var = local_scope->FindVar(var_name);
for (auto &call : all_reduce_calls) { PADDLE_ENFORCE_NOT_NULL(var, "%s is not found in local scope.", var_name);
call(); auto &lod_tensor = var->Get<LoDTensor>();
} if (!is_same_place(lod_tensor.place(), places_.at(scope_idx))) {
return true;
} }
});
#else
PADDLE_THROW("Not compiled with CUDA");
#endif
} else {
// Special handle CPU only Operator's gradient. Like CRF
auto grad_name = grads_tensor.at(0).at(0).first;
auto &trg = *this->local_exec_scopes_[0]
->FindVar(grad_name)
->GetMutable<framework::LoDTensor>();
// Reduce All data to trg in CPU
ReduceBufferData func(lod_tensor_data, trg.data<void>(), numel);
VisitDataType(trg.type(), func);
for (size_t i = 1; i < local_exec_scopes_.size(); ++i) {
auto &scope = *local_exec_scopes_[i];
auto &p = places_[i];
auto *var = scope.FindVar(grad_name);
auto *dev_ctx = dev_ctxes_.at(p);
size_t size = numel * SizeOfType(trg.type());
RunAndRecordEvent(p, [&trg, var, dev_ctx, p, size] {
auto dst_ptr = var->GetMutable<framework::LoDTensor>()->data<void>();
platform::CPUPlace cpu_place;
memory::Copy(cpu_place, dst_ptr, cpu_place, trg.data<void>(), size);
});
} }
} }
return false;
} }
@@ -202,12 +183,14 @@ void FusedAllReduceOpHandle::GetGradLoDTensor(
     std::vector<std::pair<std::string, const LoDTensor *>> *grad_tensor) const {
   auto *local_scope = local_exec_scopes_[scope_idx];
   size_t place_num = places_.size();
   for (size_t j = 0; j < in_var_handles.size(); j += place_num) {
     auto var_name = in_var_handles[j]->name();
     PADDLE_ENFORCE_EQ(var_name, out_var_handles[j]->name());
-    auto &lod_tensor = local_scope->FindVar(var_name)->Get<LoDTensor>();
-    PADDLE_ENFORCE_EQ(lod_tensor.place(), places_.at(scope_idx));
+    auto var = local_scope->FindVar(var_name);
+    PADDLE_ENFORCE_NOT_NULL(var, "%s is not found in local scope.", var_name);
+    auto &lod_tensor = var->Get<LoDTensor>();
+    PADDLE_ENFORCE_EQ(lod_tensor.place(), places_.at(scope_idx),
+                      "%s(%d) is not in the right place.", var_name, scope_idx);
     grad_tensor->emplace_back(std::make_pair(var_name, &lod_tensor));
   }
 }
......
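Note: the hunk above drops the per-handle NCCL and CPU reduction branches from FusedAllReduceOpHandle::RunImpl and delegates the actual reduction to an AllReduceFunc inherited from AllReduceOpHandle. Below is a minimal standalone C++ sketch of that delegation pattern only; ToyAllReduceHandle, ToyFusedAllReduceHandle, and the AllReduceFunc signature here are illustrative inventions, not Paddle's real classes.

#include <cstddef>
#include <iostream>
#include <vector>

// Toy base class: owns the reduction strategy once, so derived handles only
// gather their per-device buffers and delegate (mirrors moving the NCCL/CPU
// branches out of the fused handle into the shared base class).
class ToyAllReduceHandle {
 protected:
  // One shared entry point: element-wise sum across all device buffers.
  void AllReduceFunc(const std::vector<const float *> &buffers,
                     std::size_t numel, float *out) const {
    for (std::size_t i = 0; i < numel; ++i) {
      float sum = 0.f;
      for (const float *buf : buffers) sum += buf[i];
      out[i] = sum;
    }
  }
};

class ToyFusedAllReduceHandle : public ToyAllReduceHandle {
 public:
  void Run(const std::vector<std::vector<float>> &per_device_grads,
           std::vector<float> *fused_out) const {
    std::vector<const float *> buffers;
    buffers.reserve(per_device_grads.size());
    for (auto &g : per_device_grads) buffers.push_back(g.data());
    fused_out->assign(per_device_grads.front().size(), 0.f);
    // The derived handle only collects inputs and delegates.
    AllReduceFunc(buffers, fused_out->size(), fused_out->data());
  }
};

int main() {
  ToyFusedAllReduceHandle handle;
  std::vector<std::vector<float>> grads = {{1, 2, 3}, {4, 5, 6}};
  std::vector<float> out;
  handle.Run(grads, &out);
  for (float v : out) std::cout << v << " ";  // prints: 5 7 9
  std::cout << "\n";
  return 0;
}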
@@ -17,6 +17,7 @@
 #include <string>
 #include <utility>
 #include <vector>
+#include "paddle/fluid/framework/details/all_reduce_op_handle.h"
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
@@ -30,14 +31,14 @@ namespace framework {
 namespace details {
 
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-struct FusedAllReduceOpHandle : public NCCLOpHandleBase {
+struct FusedAllReduceOpHandle : public AllReduceOpHandle {
   FusedAllReduceOpHandle(ir::Node *node,
                          const std::vector<Scope *> &local_scopes,
                          const std::vector<platform::Place> &places,
                          const size_t num_of_all_reduce,
                          const platform::NCCLCommunicator *ctxs);
 #else
-struct FusedAllReduceOpHandle : public OpHandleBase {
+struct FusedAllReduceOpHandle : public AllReduceOpHandle {
   FusedAllReduceOpHandle(ir::Node *node,
                          const std::vector<Scope *> &local_scopes,
                          const std::vector<platform::Place> &places,
@@ -45,22 +46,10 @@ struct FusedAllReduceOpHandle : public OpHandleBase {
 #endif
   std::string Name() const override;
 
-  // Delay and buffer nccl_all_reduce together can significantly increase
-  // performance. Disable this feature by returning false.
-  bool IsMultiDeviceTransfer() override { return true; };
-
  protected:
   void RunImpl() override;
 
-  std::vector<Scope *> GetLocalScopes() override { return local_scopes_; }
-
  private:
-  std::vector<Scope *> local_scopes_;
-#if !(defined(PADDLE_WITH_CUDA) && !defined(_WIN32))
-  // NCCLOpHandleBase already have these attributes.
-  // Will polish it by class inheritance framework.
-  std::vector<platform::Place> places_;
-#endif
   size_t num_of_all_reduce_;
 
   // Check the dtype of the input
@@ -74,6 +63,12 @@ struct FusedAllReduceOpHandle : public OpHandleBase {
                          const std::vector<VarHandle *> &out_var_handles,
                          std::vector<std::pair<std::string, const LoDTensor *>>
                              *grad_tensor) const;
+
+  bool InputIsInDifferentPlace(
+      const std::vector<VarHandle *> &in_var_handles) const;
+
+  void FusedAllReduceFunc(const std::vector<VarHandle *> &in_var_handles,
+                          const std::vector<VarHandle *> &out_var_handles);
 };
 
 }  // namespace details
......
@@ -42,6 +42,8 @@ typedef std::vector<std::unordered_map<std::string, std::vector<VarHandle *>>>
     GraphVars;
 constexpr char kGraphVars[] = "vars";
 
+constexpr char kNRanks[] = "nranks";
+
 constexpr char kPlaces[] = "places";
 constexpr char kLocalScopes[] = "local_scopes";
 constexpr char kNCCLCtxs[] = "nccl_ctxs";
@@ -68,6 +70,9 @@ constexpr char kParamsAndSparseGrads[] = "params_and_sparse_grads";
 typedef std::vector<ProgramDesc> ProgramDescs;
 constexpr char kProgramDescs[] = "program_descs";
 
+typedef std::unordered_set<std::string> PinnedVars;
+constexpr char kPinnedVars[] = "pinned_vars";
+
 typedef std::vector<std::vector<std::pair<std::string, std::string>>>
     GroupParamsAndGrads;
 constexpr char kGroupParamsAndDenseGrads[] = "group_params_dense_grads";
......
@@ -108,6 +108,8 @@ class OpHandleBase {
   ir::Node *Node() { return node_; }
 
+  const ir::Node *Node() const { return node_; }
+
   void SetLocalExecScopes(
       const std::unordered_map<Scope *, Scope *> &scope_map);
......
@@ -78,44 +78,59 @@ struct ReduceBufferData {
   }
 };
 
-inline void GatherLocalSelectedRows(
-    const std::vector<const SelectedRows *> &src_selecte_rows_,
-    const std::vector<platform::Place> &in_places,
-    const std::map<platform::Place, platform::DeviceContext *> &dev_ctxes,
-    const platform::Place &out_place, SelectedRows *dst_selecte_rows) {
-  PADDLE_ENFORCE(!src_selecte_rows_.empty());
-
-  std::vector<Tensor> in_tensors;
-  std::vector<int64_t> out_rows;
-
-  for (auto in_sr_ptr : src_selecte_rows_) {
-    auto &in_sr = *in_sr_ptr;
-    in_tensors.emplace_back(in_sr.value());
-    out_rows.insert(out_rows.end(), in_sr.rows().begin(), in_sr.rows().end());
-  }
-
-  auto &pre_in = src_selecte_rows_[0];
-
-  auto &dst_tensor = *dst_selecte_rows;
-  dst_tensor.set_height(pre_in->height());
-  dst_tensor.set_rows(out_rows);
-  size_t rows = out_rows.size();
-  DDim out_dim = pre_in->GetCompleteDims();
-  out_dim[0] = static_cast<int64_t>(rows);
-  dst_tensor.mutable_value()->Resize(out_dim);
-  dst_tensor.mutable_value()->mutable_data(out_place, pre_in->value().type());
-  Tensor *out_tensor = dst_tensor.mutable_value();
-
-  // copy
-  int s = 0, e = 0;
-  for (size_t j = 0; j < in_tensors.size(); ++j) {
-    e += in_tensors[j].dims()[0];
-    auto sub_out = out_tensor->Slice(s, e);
-    paddle::framework::TensorCopy(in_tensors[j], out_place,
-                                  *(dev_ctxes.at(in_places[j])), &sub_out);
-    s = e;
-  }
-}
+struct GatherLocalSelectedRowsFunctor {
+  GatherLocalSelectedRowsFunctor(
+      const std::vector<const SelectedRows *> &src_selected_rows,
+      const std::vector<platform::Place> &in_places,
+      const std::map<platform::Place, platform::DeviceContext *> &dev_ctxes,
+      const platform::Place &out_place, SelectedRows *dst_selected_rows)
+      : dev_ctxes_(dev_ctxes),
+        in_places_(in_places),
+        out_place_(out_place),
+        dst_selected_rows_(dst_selected_rows) {
+    PADDLE_ENFORCE_EQ(src_selected_rows.empty(), false);
+
+    std::vector<int64_t> out_rows;
+
+    for (auto in_sr_ptr : src_selected_rows) {
+      auto &in_sr = *in_sr_ptr;
+      in_tensors_.emplace_back(in_sr.value());
+      out_rows.insert(out_rows.end(), in_sr.rows().begin(), in_sr.rows().end());
+    }
+
+    auto &pre_in = src_selected_rows[0];
+
+    auto &dst_tensor = *dst_selected_rows_;
+    dst_tensor.set_height(pre_in->height());
+    dst_tensor.set_rows(out_rows);
+    size_t rows = out_rows.size();
+    DDim out_dim = pre_in->GetCompleteDims();
+    out_dim[0] = static_cast<int64_t>(rows);
+    dst_tensor.mutable_value()->Resize(out_dim);
+    dst_tensor.mutable_value()->mutable_data(out_place, pre_in->value().type());
+  }
+
+  void operator()() {
+    auto *out_tensor = dst_selected_rows_->mutable_value();
+
+    // copy
+    int s = 0, e = 0;
+    for (size_t j = 0; j < in_tensors_.size(); ++j) {
+      e += in_tensors_[j].dims()[0];
+      auto sub_out = out_tensor->Slice(s, e);
+      paddle::framework::TensorCopy(in_tensors_[j], out_place_,
+                                    *(dev_ctxes_.at(in_places_[j])), &sub_out);
+      s = e;
+    }
+  }
+
+ private:
+  const std::map<platform::Place, platform::DeviceContext *> &dev_ctxes_;
+  std::vector<platform::Place> in_places_;
+  std::vector<Tensor> in_tensors_;
+  platform::Place out_place_;
+  SelectedRows *dst_selected_rows_;
+};
 
 }  // namespace details
 }  // namespace framework
......
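Note: the hunk above turns the free function GatherLocalSelectedRows into a functor whose constructor only prepares the destination (rows, dims, allocation) and whose operator() performs the copies, so callers can insert a wait for inputs between the two phases. The following is a minimal standalone sketch of that two-phase functor idea; ToyGatherFunctor and its std::vector-based types are hypothetical stand-ins for SelectedRows, not Paddle code.

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

// Toy two-phase gather: the constructor sizes the destination up front,
// while operator() does the actual copying later, so a caller can place a
// synchronization step (e.g. "wait until inputs are generated") in between.
struct ToyGatherFunctor {
  ToyGatherFunctor(const std::vector<const std::vector<int> *> &srcs,
                   std::vector<int> *dst)
      : srcs_(srcs), dst_(dst) {
    std::size_t total = 0;
    for (const auto *s : srcs_) total += s->size();
    dst_->resize(total);  // phase 1: allocation / metadata only
  }

  void operator()() {  // phase 2: data movement
    std::size_t offset = 0;
    for (const auto *s : srcs_) {
      std::copy(s->begin(), s->end(), dst_->begin() + offset);
      offset += s->size();
    }
  }

 private:
  std::vector<const std::vector<int> *> srcs_;
  std::vector<int> *dst_;
};

int main() {
  std::vector<int> a = {1, 2}, b = {3, 4, 5};
  std::vector<int> out;
  ToyGatherFunctor gather({&a, &b}, &out);
  // ... a WaitInputVarGenerated()-like barrier could run here ...
  gather();
  for (int v : out) std::cout << v << " ";  // prints: 1 2 3 4 5
  std::cout << "\n";
  return 0;
}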
@@ -66,8 +66,11 @@ void ReduceOpHandle::GatherSelectedRows(
   auto gathered_var_mid = scope->Var(gathered_var_name);
   auto gathered_select_rows =
       gathered_var_mid->GetMutable<framework::SelectedRows>();
-  GatherLocalSelectedRows(src_selected_rows, in_places, dev_ctxes, out_place,
-                          gathered_select_rows);
+  GatherLocalSelectedRowsFunctor functor(
+      src_selected_rows, in_places, dev_ctxes, out_place, gathered_select_rows);
+  WaitInputVarGenerated();
+  functor();
 
   // FIXME(gongwb): remove this Wait.
   Wait(dev_ctxes);
@@ -167,9 +170,6 @@ void ReduceOpHandle::RunImpl() {
       var_scopes.at(in_0_handle->scope_idx())->FindVar(in_0_handle->name());
   PADDLE_ENFORCE_NOT_NULL(pre_in_var);
 
-  // Wait input done, this Wait is asynchronous operation
-  WaitInputVarGenerated();
-
   // NOTE: The Places of all input tensor must be all on CPU or all on GPU.
   std::vector<platform::Place> in_places;  // used to get dev_ctx
   for (auto *in_handle : in_var_handles) {
@@ -209,9 +209,11 @@ void ReduceOpHandle::RunImpl() {
       // TODO(gongwb): add cpu support
       if (collective_context.endpoints_.size() <= 1 ||
           is_cpu_place(in_places[0]) || is_cpu_place(t_out_p)) {
-        GatherLocalSelectedRows(in_selected_rows, in_places, dev_ctxes_,
-                                t_out_p,
-                                out_var->GetMutable<framework::SelectedRows>());
+        GatherLocalSelectedRowsFunctor functor(
+            in_selected_rows, in_places, dev_ctxes_, t_out_p,
+            out_var->GetMutable<framework::SelectedRows>());
+        WaitInputVarGenerated();
+        functor();
         return;
       }
@@ -236,6 +238,7 @@ void ReduceOpHandle::RunImpl() {
         GetInputValues<LoDTensor>(in_var_handles, var_scopes);
 
     if (paddle::platform::is_cpu_place(lod_tensors[0]->place())) {
+      WaitInputVarGenerated();
      this->RunAndRecordEvent([&] {
        // FIXME(zcd): The order of summing is important,
        // especially when the type of data is float or double.
@@ -295,6 +298,7 @@ void ReduceOpHandle::RunImpl() {
      });
    }
 
+    WaitInputVarGenerated();
    this->RunAndRecordEvent([&] {
      platform::NCCLGroupGuard guard;
      for (auto &call : all_reduce_calls) {
......
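Note: the hunks above move WaitInputVarGenerated() from the top of ReduceOpHandle::RunImpl to just before the gathered data is actually consumed. A rough standalone sketch of the same "block as late as possible" idea using std::future; the producer/consumer shapes here are invented for illustration and are not Paddle code.

#include <chrono>
#include <future>
#include <iostream>
#include <thread>
#include <vector>

int main() {
  // Inputs are produced asynchronously (stand-in for upstream op handles).
  std::vector<std::future<int>> inputs;
  for (int i = 0; i < 4; ++i) {
    inputs.emplace_back(std::async(std::launch::async, [i] {
      std::this_thread::sleep_for(std::chrono::milliseconds(10));
      return i * i;
    }));
  }

  // Independent setup that does not need the inputs yet
  // (analogous to building the gather functor / output metadata first).
  std::vector<int> gathered;
  gathered.reserve(inputs.size());

  // The WaitInputVarGenerated()-like step, deferred until consumption.
  for (auto &f : inputs) gathered.push_back(f.get());

  for (int v : gathered) std::cout << v << " ";  // prints: 0 1 4 9
  std::cout << "\n";
  return 0;
}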
@@ -38,13 +38,11 @@ struct ScaleLossGradFunctor {
   float coeff_;
   Tensor *out_;
   platform::Place place_;
-  OpHandleBase *op_handle_;
   proto::VarType::Type out_dtype_;
   platform::DeviceContext *ctx_;
 
   ScaleLossGradFunctor(float coeff, Tensor *out, platform::Place place,
-                       OpHandleBase *op_handle, proto::VarType::Type dtype,
-                       platform::DeviceContext *ctx)
+                       proto::VarType::Type dtype, platform::DeviceContext *ctx)
       : coeff_(coeff), out_(out), place_(place), out_dtype_(dtype), ctx_(ctx) {}
 
   template <typename OutT>
@@ -76,11 +74,11 @@ void ScaleLossGradOpHandle::RunImpl() {
   tensor->Resize(make_ddim({1}));
 
 #ifdef PADDLE_WITH_CUDA
-  ScaleLossGradFunctor func(coeff_, tensor, place_, this, out_dtype_,
+  ScaleLossGradFunctor func(coeff_, tensor, place_, out_dtype_,
                             this->dev_ctxes_.at(place_));
   this->RunAndRecordEvent([&] { framework::VisitDataType(out_dtype_, func); });
 #else
-  ScaleLossGradFunctor func(coeff_, tensor, place_, this, out_dtype_, nullptr);
+  ScaleLossGradFunctor func(coeff_, tensor, place_, out_dtype_, nullptr);
   framework::VisitDataType(out_dtype_, func);
 #endif
 }
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/scope_buffered_monitor.h"
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/profiler.h"
DECLARE_double(local_exe_sub_scope_limit);
namespace paddle {
namespace framework {
namespace details {
static constexpr double kMB = 1.0 / (1024 * 1024);
static void GetTensors(Variable *var,
std::unordered_set<Tensor *> *tensor_set) {
if (var->IsType<LoDTensor>() && var->Get<LoDTensor>().IsInitialized()) {
tensor_set->insert(var->GetMutable<LoDTensor>());
} else if (var->IsType<SelectedRows>() &&
var->Get<SelectedRows>().value().IsInitialized()) {
tensor_set->insert(var->GetMutable<SelectedRows>()->mutable_value());
} else if (var->IsType<LoDTensorArray>()) {
auto *tensor_arr = var->GetMutable<LoDTensorArray>();
for (auto &t : *tensor_arr) {
if (t.IsInitialized()) {
tensor_set->insert(&t);
}
}
}
}
static void GetTensors(Scope *scope, std::unordered_set<Tensor *> *tensor_set) {
for (auto &var_name : scope->LocalVarNames()) {
GetTensors(scope->FindVar(var_name), tensor_set);
}
for (auto *kid : scope->kids()) {
GetTensors(kid, tensor_set);
}
}
static size_t GetTensorMemorySize(Scope *scope, bool clear_cpu_tensor) {
std::unordered_set<Tensor *> tensor_set;
GetTensors(scope, &tensor_set);
size_t memory_size = 0;
std::unordered_set<memory::Allocation *> allocation_set;
for (auto *tensor : tensor_set) {
if (clear_cpu_tensor && platform::is_cpu_place(tensor->place())) {
tensor->clear();
} else {
auto allocation = tensor->Holder().get();
if (!allocation_set.count(allocation)) {
memory_size += allocation->size();
allocation_set.insert(allocation);
}
}
}
return memory_size;
}
size_t GetScopeVarMemorySize(Scope *scope) {
return GetTensorMemorySize(scope, false /*clear_cpu_tensor*/);
}
ScopeBufferedMonitor::ScopeBufferedMonitor(
const std::vector<platform::Place> &places,
const std::vector<Scope *> &local_exec_scopes)
: places_(places), local_exec_scopes_(local_exec_scopes) {
pre_local_exec_scopes_.resize(local_exec_scopes_.size());
post_local_exec_scopes_.resize(local_exec_scopes_.size());
}
void ScopeBufferedMonitor::Apply(const std::function<void()> &callback,
bool has_fetch) {
std::unique_ptr<platform::RecordEvent> pre_local_exec_scopes_event(
new platform::RecordEvent(
"ScopeBufferedMonitor::pre_local_exec_scopes_process"));
for (size_t scope_id = 0; scope_id < local_exec_scopes_.size(); ++scope_id) {
pre_local_exec_scopes_.at(scope_id).clear();
auto scopes = local_exec_scopes_.at(scope_id)->kids();
VLOG(10) << "pre_local_exec_scopes[" << scope_id
<< "] sub-scope: " << scopes.size();
pre_local_exec_scopes_.at(scope_id).insert(scopes.begin(), scopes.end());
}
pre_local_exec_scopes_event.reset();
callback();
std::unique_ptr<platform::RecordEvent> post_local_exec_scopes_event(
new platform::RecordEvent(
"ScopeBufferedMonitor::post_local_exec_scopes_process"));
for (size_t scope_id = 0; scope_id < local_exec_scopes_.size(); ++scope_id) {
post_local_exec_scopes_.at(scope_id).clear();
auto scopes = local_exec_scopes_.at(scope_id)->kids();
VLOG(10) << "post_local_exec_scopes[" << scope_id
<< "] sub-scope: " << scopes.size();
post_local_exec_scopes_.at(scope_id).insert(scopes.begin(), scopes.end());
}
history_local_exec_scopes_.emplace_back();
auto &incr_local_exec_scopes = history_local_exec_scopes_.back();
incr_local_exec_scopes.resize(local_exec_scopes_.size());
for (size_t scope_id = 0; scope_id < local_exec_scopes_.size(); ++scope_id) {
for (auto &scope : post_local_exec_scopes_.at(scope_id)) {
if (!pre_local_exec_scopes_.at(scope_id).count(scope)) {
incr_local_exec_scopes.at(scope_id).insert(scope);
}
}
if (VLOG_IS_ON(10)) {
if (incr_local_exec_scopes.at(scope_id).size() &&
FLAGS_local_exe_sub_scope_limit > 0) {
VLOG(10)
<< "FLAGS_local_exe_sub_scope_limit is "
<< FLAGS_local_exe_sub_scope_limit
<< " MBytes now. If you don't need to limit the memory of local "
"execution scope, you should set "
"FLAGS_local_exe_sub_scope_limit=-1.";
}
std::stringstream out;
out << scope_id << " kids: ";
for (auto &scope : incr_local_exec_scopes.at(scope_id)) {
out << scope << ", ";
}
VLOG(10) << out.str();
}
}
size_t history_step = history_local_exec_scopes_.size();
if (has_fetch && history_step >= 2) {
ClearHistoryLocalExecScopes(history_step - 1);
}
// Delete CPU Memory
std::vector<size_t> gpu_memory_size_per_gpu(places_.size());
for (auto &scope_vec : history_local_exec_scopes_) {
for (size_t idx = 0; idx < scope_vec.size(); ++idx) {
for (auto &scope : scope_vec.at(idx)) {
gpu_memory_size_per_gpu.at(idx) +=
GetTensorMemorySize(scope, true /*clear_cpu_tensor*/);
}
}
}
if (VLOG_IS_ON(8)) {
for (size_t idx = 0; idx < gpu_memory_size_per_gpu.size(); ++idx) {
VLOG(8) << "history local exec scopes contains "
<< string::HumanReadableSize(gpu_memory_size_per_gpu.at(idx))
<< " in " << places_.at(idx);
}
}
if (FLAGS_local_exe_sub_scope_limit > 0) {
for (size_t idx = 0; idx < gpu_memory_size_per_gpu.size(); ++idx) {
if (gpu_memory_size_per_gpu.at(idx) / kMB >=
FLAGS_local_exe_sub_scope_limit) {
platform::DeviceContextPool::Instance().Get(places_.at(idx))->Wait();
local_exec_scopes_.at(idx)->DropKids();
}
for (auto &scope_vec : history_local_exec_scopes_) {
scope_vec.at(idx).clear();
}
}
}
}
void ScopeBufferedMonitor::ClearHistoryLocalExecScopes(size_t history_step) {
VLOG(10) << "delete pre_incr_local_exec_scopes.";
for (size_t i = 0; i < history_step; ++i) {
auto &pre_incr_local_exec_scopes = history_local_exec_scopes_.front();
for (size_t scope_idx = 0; scope_idx < pre_incr_local_exec_scopes.size();
++scope_idx) {
for (auto scope : pre_incr_local_exec_scopes[scope_idx]) {
local_exec_scopes_.at(scope_idx)->DeleteScope(scope);
}
}
history_local_exec_scopes_.pop_front();
}
}
void ScopeBufferedMonitor::ClearHistoryLocalExecScopes() {
history_local_exec_scopes_.clear();
}
} // namespace details
} // namespace framework
} // namespace paddle
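Note: ScopeBufferedMonitor::Apply above snapshots the set of child scopes before and after the callback and keeps the set difference as the scopes the callback created. The following is a minimal standalone sketch of that snapshot-and-diff bookkeeping; ToyScope and NewKid are invented stand-ins for Scope and its kids(), not Paddle's actual API.

#include <iostream>
#include <memory>
#include <unordered_set>
#include <vector>

// Toy scope with owned children, enough to show the pre/post diffing trick.
struct ToyScope {
  std::vector<std::unique_ptr<ToyScope>> kids;
  ToyScope *NewKid() {
    kids.emplace_back(new ToyScope());
    return kids.back().get();
  }
};

int main() {
  ToyScope root;
  root.NewKid();  // pre-existing child scope

  // Snapshot before running the callback.
  std::unordered_set<const ToyScope *> pre;
  for (auto &k : root.kids) pre.insert(k.get());

  // callback(): creates two new child scopes.
  root.NewKid();
  root.NewKid();

  // Snapshot after, and keep only the children not seen before.
  std::unordered_set<const ToyScope *> incremental;
  for (auto &k : root.kids) {
    if (!pre.count(k.get())) incremental.insert(k.get());
  }
  std::cout << "new children created by callback: " << incremental.size()
            << "\n";  // prints: 2
  return 0;
}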
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <deque>
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/scope.h"
namespace paddle {
namespace framework {
namespace details {
class ScopeBufferedMonitor {
public:
ScopeBufferedMonitor(const std::vector<platform::Place> &places,
const std::vector<Scope *> &local_exec_scopes);
void Apply(const std::function<void()> &callback, bool has_fetch);
void ClearHistoryLocalExecScopes();
void ClearHistoryLocalExecScopes(size_t history_step);
private:
std::vector<platform::Place> places_;
std::vector<Scope *> local_exec_scopes_;
std::vector<std::unordered_set<Scope *>> pre_local_exec_scopes_;
std::vector<std::unordered_set<Scope *>> post_local_exec_scopes_;
std::deque<std::vector<std::unordered_set<Scope *>>>
history_local_exec_scopes_;
};
size_t GetScopeVarMemorySize(Scope *scope);
} // namespace details
} // namespace framework
} // namespace paddle
@@ -18,6 +18,7 @@
 #include "paddle/fluid/framework/details/reduce_and_gather.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/platform/gpu_info.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -103,16 +104,15 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
   int dev_id = boost::get<platform::CUDAPlace>(place).device;
   auto *nccl_ctxs = nccl_ctxs_->GetRunEnvNCCLCtx(run_order_, false);
   auto &nccl_ctx = nccl_ctxs->at(dev_id);
+  auto *dev_ctx = nccl_ctxs->DevCtx(dev_id);
   auto stream = nccl_ctx.stream();
   auto comm = nccl_ctx.comm_;
 
-  auto &allocator =
-      platform::DeviceTemporaryAllocator::Instance().Get(place, stream);
   int encode_size = 2 * k * sizeof(int);
   // dgc use ncclAllGather to get all the encoded data
   // so the buffer need nranks.
   int buf_size = nranks_ * encode_size;
-  auto tmp_ious_data = allocator.Allocate(buf_size);
+  auto tmp_ious_data = memory::Alloc(*dev_ctx, buf_size);
   void *gather_buff = reinterpret_cast<void *>(tmp_ious_data->ptr());
 
   VLOG(10) << "in_numel:" << in_numel << ", out_numel:" << out_numel
@@ -126,7 +126,7 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
     });
   }
 
-  RunAllReduceFuncs(all_reduce_calls);
+  NCCLAllReduceFunc(all_reduce_calls);
 }
 
 int SparseAllReduceOpHandle::GetKValue(const std::string &grad_name) {
......
(The remaining file diffs in this commit are collapsed and not shown.)