Commit 73fa5ef3, authored by xiexionghang

fork from paddle v1.4, branch: paddle_feed_news_201910

Parent 2455cb5f

Too many changes to display.

To preserve performance, only 1,000 of 1,000+ files are displayed.
......@@ -27,18 +27,27 @@ message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
message(STATUS "AR tools: ${CMAKE_AR}")
if(WIN32)
option(MSVC_STATIC_CRT "use static C Runtime library by default" ON)
set(CMAKE_SUPPRESS_REGENERATION ON)
set(CMAKE_STATIC_LIBRARY_PREFIX lib)
add_definitions("/DGOOGLE_GLOG_DLL_DECL=")
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd")
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
if (MSVC_STATIC_CRT)
message(STATUS "Use static C runtime time, refer to https://docs.microsoft.com/en-us/cpp/c-runtime-library/crt-library-features?view=vs-2019")
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd")
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
endif()
add_compile_options(/wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838)
set(PADDLE_LINK_FLAGS "/IGNORE:4006 /IGNORE:4098 /IGNORE:4217 /IGNORE:4221")
set(CMAKE_STATIC_LINKER_FLAGS "${CMAKE_STATIC_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
else(WIN32)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations")
endif(WIN32)
find_package(CUDA QUIET)
......@@ -54,7 +63,6 @@ option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FO
option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON)
option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF)
option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND})
option(WITH_CUSTOM_TRAINER "Turn on the custom trainer implementation" OFF)
option(WITH_SYSTEM_BLAS "Use system blas library" OFF)
option(WITH_DISTRIBUTE "Compile with distributed support" OFF)
option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocol" OFF)
......@@ -66,14 +74,15 @@ option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler and gperftools"
option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF)
option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF)
option(WITH_PSLIB "Compile with pslib support" OFF)
option(WITH_BOX_PS "Compile with box_ps support" OFF)
option(WITH_CONTRIB "Compile the third-party contribution" OFF)
option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF)
option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE})
option(WITH_INFERENCE_API_TEST "Test fluid inference C++ high-level api interface" OFF)
option(WITH_HIGH_LEVEL_API_TEST "Test fluid python high-level api interface" OFF)
option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION})
option(WITH_FAST_MATH "Make use of fast math library, might affect the precision to some extent" ON)
option(WITH_DGC "Use DGC(Deep Gradient Compression) or not" ON)
option(SANITIZER_TYPE "Choose the type of sanitizer, options are: Address, Leak, Memory, Thread, Undefined" OFF)
# PY_VERSION
if(NOT PY_VERSION)
......@@ -83,7 +92,7 @@ set(PYBIND11_PYTHON_VERSION ${PY_VERSION})
# CMAKE_BUILD_TYPE
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING
set(CMAKE_BUILD_TYPE "Release" CACHE STRING
"Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel"
FORCE)
endif()
......@@ -122,6 +131,12 @@ endif()
if (REPLACE_ENFORCE_GLOG)
add_definitions("-DREPLACE_ENFORCE_GLOG")
endif()
if (SANITIZER_TYPE AND NOT "${SANITIZER_TYPE}" MATCHES "^(Address|Leak|Memory|Thread|Undefined)$")
message("Choose the correct type of sanitizer")
return()
endif()
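For orientation: the guard above only checks the spelling of SANITIZER_TYPE; an accepted value is lowercased and mapped to an -fsanitize flag later in cmake/flags.cmake (further down in this commit). A minimal sketch of that flow, with a hypothetical sample value, not part of the commit itself:

```
# Illustrative only: validate a sample value the same way the guard above does,
# then derive the flag that flags.cmake would append to the common compile flags.
set(_sample_sanitizer "Address")   # hypothetical value, e.g. supplied via -DSANITIZER_TYPE=Address
if(NOT "${_sample_sanitizer}" MATCHES "^(Address|Leak|Memory|Thread|Undefined)$")
    message(FATAL_ERROR "Choose the correct type of sanitizer")
endif()
string(TOLOWER "${_sample_sanitizer}" _sample_lower)
message(STATUS "would compile with -fsanitize=${_sample_lower}")
```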
########################################################################################
include(external/mklml) # download mklml package
......@@ -144,15 +159,11 @@ include(external/cub)
include(external/rocprim)
include(external/xxhash) # download xxhash
include(external/dlpack)
include(external/snappy) # download snappy
include(external/snappystream) # download snappystream
include(external/warpctc) # download, build, install warpctc
include(external/yaml-cpp) # download yaml
if (NOT WIN32)
# there is no official support of nccl, cupti in windows
include(cupti)
include(external/gzstream)
endif (NOT WIN32)
if(WITH_PSLIB)
......@@ -160,6 +171,9 @@ if(WITH_PSLIB)
include(external/pslib_brpc)
include(external/pslib)
endif(WITH_PSLIB)
if(WITH_BOX_PS)
include(external/box_ps)
endif(WITH_BOX_PS)
if(WITH_DISTRIBUTE)
if(WITH_GRPC)
......@@ -211,7 +225,6 @@ if (WITH_PROFILER)
endif()
include(generic) # simplify cmake module
include(package) # set paddle packages
include(ccache) # set ccache for compilation
include(util) # set unittest and link libs
include(version) # set PADDLE_VERSION
......
......@@ -54,8 +54,8 @@ RUN wget -q https://www.python.org/ftp/python/3.7.0/Python-3.7.0.tgz && \
RUN rm -r /root/python_build
RUN apt-get update && \
apt-get install -y --allow-downgrades patchelf \
python3 python3-dev python3-pip \
apt-get install -y --allow-downgrades --allow-change-held-packages \
patchelf python3 python3-dev python3-pip \
git python-pip python-dev python-opencv openssh-server bison \
libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 \
wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
......@@ -172,6 +172,11 @@ RUN pip3.6 --no-cache-dir install pylint pytest astroid isort
RUN pip3.7 --no-cache-dir install pylint pytest astroid isort
RUN pip --no-cache-dir install pylint pytest astroid isort LinkChecker
RUN pip3 --no-cache-dir install coverage
RUN pip3.6 --no-cache-dir install coverage
RUN pip3.7 --no-cache-dir install coverage
RUN pip --no-cache-dir install coverage
COPY ./python/requirements.txt /root/
RUN pip3 --no-cache-dir install -r /root/requirements.txt
RUN pip3.6 --no-cache-dir install -r /root/requirements.txt
......
# PaddlePaddle (clone from /baidu/paddlepaddle/paddle@feed-trainer)
# PaddlePaddle
Fork From http://icode.baidu.com/repos/baidu/paddlepaddle/paddle/tree/paddle_feed_news_201910 (commitid:f50e701) v1.4
English | [简体中文](./README_cn.md)
[![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/documentation/docs/en/1.4/beginners_guide/index_en.html)
[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/documentation/docs/zh/1.4/beginners_guide/index_cn.html)
[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org.cn/documentation/docs/en/1.5/beginners_guide/index_en.html)
[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/beginners_guide/index_cn.html)
[![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
......@@ -18,17 +18,18 @@ learning to many products at Baidu.
Our vision is to enable deep learning for everyone via PaddlePaddle.
Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle.
### Latest PaddlePaddle Release: [Fluid 1.5.1](https://github.com/PaddlePaddle/Paddle/tree/release/1.5)
### Latest PaddlePaddle Release: [Fluid 1.5.2](https://github.com/PaddlePaddle/Paddle/tree/release/1.5)
### Install Latest Stable Release:
```
# Linux CPU
pip install paddlepaddle
# Linux GPU cuda9cudnn7
pip install paddlepaddle-gpu
# Linux GPU cuda10cudnn7
pip install paddlepaddle-gpu==1.5.1.post107
pip install paddlepaddle-gpu
# Linux GPU cuda8cudnn7
pip install paddlepaddle-gpu==1.5.1.post87
pip install paddlepaddle-gpu==1.5.2.post87
# Linux GPU cuda9cudnn7
pip install paddlepaddle-gpu==1.5.2.post97
# For installation on other platform, refer to http://paddlepaddle.org/
```
......@@ -76,33 +77,33 @@ Now our developers could acquire Tesla V100 online computing resources for free.
## Installation
It is recommended to read [this doc](http://www.paddlepaddle.org/documentation/docs/en/1.4/beginners_guide/index_en.html) on our website.
It is recommended to read [this doc](http://www.paddlepaddle.org.cn/documentation/docs/en/1.5/beginners_guide/index_en.html) on our website.
## Documentation
We provide [English](http://www.paddlepaddle.org/documentation/docs/en/1.4/beginners_guide/index_en.html) and
[Chinese](http://www.paddlepaddle.org/documentation/docs/zh/1.4/beginners_guide/install/index_cn.html) documentation.
We provide [English](http://www.paddlepaddle.org.cn/documentation/docs/en/1.5/beginners_guide/index_en.html) and
[Chinese](http://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/beginners_guide/install/index_cn.html) documentation.
- [Deep Learning 101](https://github.com/PaddlePaddle/book)
You might want to start from this online interactive book that can run in a Jupyter Notebook.
- [Distributed Training](http://paddlepaddle.org/documentation/docs/en/1.4/user_guides/howto/training/multi_node_en.html)
- [Distributed Training](http://paddlepaddle.org.cn/documentation/docs/en/1.5/user_guides/howto/training/multi_node_en.html)
You can run distributed training jobs on MPI clusters.
- [Python API](http://paddlepaddle.org/documentation/docs/en/1.4/api/index_en.html)
- [Python API](http://paddlepaddle.org.cn/documentation/docs/en/1.5/api/index_en.html)
Our new API enables much shorter programs.
- [How to Contribute](http://paddlepaddle.org/documentation/docs/en/1.4/advanced_usage/development/contribute_to_paddle/index_en.html)
- [How to Contribute](http://paddlepaddle.org.cn/documentation/docs/en/1.5/advanced_usage/development/contribute_to_paddle/index_en.html)
We appreciate your contributions!
## Communication
- [Github Issues](https://github.com/PaddlePaddle/Paddle/issues): bug reports, feature requests, install issues, usage issues, etc.
- QQ discussion group: 432676488 (PaddlePaddle).
- QQ discussion group: 796771754 (PaddlePaddle).
- [Forums](http://ai.baidu.com/forum/topic/list/168?pageNo=1): discuss implementations, research, etc.
## Copyright and License
......
......@@ -3,8 +3,8 @@
[English](./README.md) | 简体中文
[![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/documentation/docs/en/1.4/beginners_guide/index_en.html)
[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/documentation/docs/zh/1.4/beginners_guide/index_cn.html)
[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org.cn/documentation/docs/en/1.5/beginners_guide/index_en.html)
[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/beginners_guide/index_cn.html)
[![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
......@@ -16,17 +16,18 @@ PaddlePaddle (PArallel Distributed Deep LEarning) 是一个简单易用、高效
跟进PaddlePaddle最新特性请参考我们的[版本说明](https://github.com/PaddlePaddle/Paddle/releases)
### PaddlePaddle最新版本: [Fluid 1.5.1](https://github.com/PaddlePaddle/Paddle/tree/release/1.5)
### PaddlePaddle最新版本: [Fluid 1.5.2](https://github.com/PaddlePaddle/Paddle/tree/release/1.5)
### 安装最新稳定版本:
```
# Linux CPU
pip install paddlepaddle
# Linux GPU cuda9cudnn7
pip install paddlepaddle-gpu
# Linux GPU cuda10cudnn7
pip install paddlepaddle-gpu==1.5.1.post107
pip install paddlepaddle-gpu
# Linux GPU cuda8cudnn7
pip install paddlepaddle-gpu==1.5.1.post87
pip install paddlepaddle-gpu==1.5.2.post87
# Linux GPU cuda9cudnn7
pip install paddlepaddle-gpu==1.5.2.post97
# 其他平台上的安装指引请参考 http://paddlepaddle.org/
```
......@@ -58,33 +59,33 @@ PaddlePaddle用户可领取**免费Tesla V100在线算力资源**,训练模型
## 安装
推荐阅读官网上的[安装说明](http://www.paddlepaddle.org/documentation/docs/zh/1.4/beginners_guide/install/index_cn.html)
推荐阅读官网上的[安装说明](http://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/beginners_guide/install/index_cn.html)
## 文档
我们提供[英文](http://www.paddlepaddle.org/documentation/docs/en/1.4/beginners_guide/index_en.html)
[中文](http://www.paddlepaddle.org/documentation/docs/zh/1.4/beginners_guide/install/index_cn.html) 文档
我们提供[英文](http://www.paddlepaddle.org.cn/documentation/docs/en/1.5/beginners_guide/index_en.html)
[中文](http://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/beginners_guide/install/index_cn.html) 文档
- [深度学习101](https://github.com/PaddlePaddle/book)
或许您想从这个在线交互式书籍开始,可以在Jupyter Notebook中运行
- [分布式训练](http://paddlepaddle.org/documentation/docs/zh/1.4/user_guides/howto/training/multi_node.html)
- [分布式训练](http://paddlepaddle.org.cn/documentation/docs/zh/1.5/user_guides/howto/training/multi_node.html)
可以在MPI集群上运行分布式训练任务
- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.4/api_cn/index_cn.html)
- [Python API](http://paddlepaddle.org.cn/documentation/docs/zh/1.5/api_cn/index_cn.html)
新的API支持代码更少更简洁的程序
- [贡献方式](http://paddlepaddle.org/documentation/docs/zh/1.4/advanced_usage/development/contribute_to_paddle/index_cn.html)
- [贡献方式](http://paddlepaddle.org.cn/documentation/docs/zh/1.5/advanced_usage/development/contribute_to_paddle/index_cn.html)
欢迎您的贡献!
## 交流与反馈
- 欢迎您通过[Github Issues](https://github.com/PaddlePaddle/Paddle/issues)来提交问题、报告与建议
- QQ群: 432676488 (PaddlePaddle)
- QQ群: 796771754 (PaddlePaddle)
- [论坛](http://ai.baidu.com/forum/topic/list/168): 欢迎大家在PaddlePaddle论坛分享在使用PaddlePaddle中遇到的问题和经验, 营造良好的论坛氛围
## 版权和许可证
......
......@@ -62,6 +62,10 @@ if(WITH_PSLIB)
add_definitions(-DPADDLE_WITH_PSLIB)
endif()
if(WITH_BOX_PS)
add_definitions(-DPADDLE_WITH_BOX_PS)
endif()
if(WITH_GPU)
add_definitions(-DPADDLE_WITH_CUDA)
add_definitions(-DEIGEN_USE_GPU)
......@@ -88,14 +92,20 @@ if(WITH_GPU)
include_directories(${CUDA_TOOLKIT_INCLUDE})
if(TENSORRT_FOUND)
if(${CUDA_VERSION_MAJOR} VERSION_LESS 8)
message(FATAL_ERROR "TensorRT needs CUDA >= 8.0 to compile")
endif()
if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
message(FATAL_ERROR "TensorRT needs CUDNN >= 7.0 to compile")
endif()
if(${TENSORRT_MAJOR_VERSION} VERSION_LESS 4)
message(FATAL_ERROR "Paddle needs TensorRT >= 4.0 to compile")
if(WIN32)
if(${CUDA_VERSION_MAJOR} VERSION_LESS 9)
message(FATAL_ERROR "TensorRT needs CUDA >= 9.0 to compile on Windows")
endif()
else()
if(${CUDA_VERSION_MAJOR} VERSION_LESS 8)
message(FATAL_ERROR "TensorRT needs CUDA >= 8.0 to compile")
endif()
if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
message(FATAL_ERROR "TensorRT needs CUDNN >= 7.0 to compile")
endif()
if(${TENSORRT_MAJOR_VERSION} VERSION_LESS 4)
message(FATAL_ERROR "Paddle needs TensorRT >= 4.0 to compile")
endif()
endif()
include_directories(${TENSORRT_INCLUDE_DIR})
endif()
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import shutil
import glob


def main():
    src = sys.argv[1]
    dst = sys.argv[2]
    if os.path.isdir(src):  #copy directory
        pathList = os.path.split(src)
        dst = os.path.join(dst, pathList[-1])
        if not os.path.exists(dst):
            shutil.copytree(src, dst)
            print("first copy directory: {0} --->>> {1}".format(src, dst))
        else:
            shutil.rmtree(dst)
            shutil.copytree(src, dst)
            print("overwritten copy directory: {0} --->>> {1}".format(src, dst))
    else:  #copy file, wildcard
        if not os.path.exists(dst):
            os.makedirs(dst)
        srcFiles = glob.glob(src)
        for srcFile in srcFiles:
            shutil.copy(srcFile, dst)
            print("copy file: {0} --->>> {1}".format(srcFile, dst))


if __name__ == "__main__":
    main()
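The helper above is what the reworked copy() rule in cmake/inference_lib.cmake (further down in this commit) shells out to on Windows, where cmd does not expand wildcards. A hedged usage sketch with hypothetical target and path names, assuming PYTHON_EXECUTABLE has been resolved (the commit does this via FIND_PACKAGE(PythonInterp)):

```
# Illustrative only: wiring copyfile.py into a post-build copy step.
add_custom_target(copy_headers_example)   # hypothetical target name
file(TO_NATIVE_PATH "${CMAKE_SOURCE_DIR}/include/*.h" native_src)
file(TO_NATIVE_PATH "${CMAKE_BINARY_DIR}/dist/include" native_dst)
add_custom_command(TARGET copy_headers_example POST_BUILD
    COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/copyfile.py ${native_src} ${native_dst}
    COMMENT "copying ${native_src} -> ${native_dst}")
```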
......@@ -186,10 +186,6 @@ list(APPEND CUDA_NVCC_FLAGS "-std=c++11")
list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC")
endif(NOT WIN32)
if(WITH_FAST_MATH)
# Make use of fast math library. https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html
list(APPEND CUDA_NVCC_FLAGS "--use_fast_math")
endif()
# in cuda9, suppress cuda warning on eigen
list(APPEND CUDA_NVCC_FLAGS "-w")
# Set :expt-relaxed-constexpr to suppress Eigen warnings
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
IF(NOT ${WITH_BOX_PS})
return()
ENDIF(NOT ${WITH_BOX_PS})
IF(WIN32 OR APPLE)
MESSAGE(WARNING
"Windows or Mac is not supported with BOX_PS in Paddle yet."
"Force WITH_BOX_PS=OFF")
SET(WITH_BOX_PS OFF CACHE STRING "Disable BOX_PS package in Windows and MacOS" FORCE)
return()
ENDIF()
INCLUDE(ExternalProject)
SET(BOX_PS_PROJECT "extern_box_ps")
IF((NOT DEFINED BOX_PS_VER) OR (NOT DEFINED BOX_PS_URL))
MESSAGE(STATUS "use pre defined download url")
SET(BOX_PS_VER "0.1.1" CACHE STRING "" FORCE)
SET(BOX_PS_NAME "box_ps" CACHE STRING "" FORCE)
SET(BOX_PS_URL "http://box-ps.gz.bcebos.com/box_ps_stub.tar.gz" CACHE STRING "" FORCE)
ENDIF()
MESSAGE(STATUS "BOX_PS_NAME: ${BOX_PS_NAME}, BOX_PS_URL: ${BOX_PS_URL}")
SET(BOX_PS_SOURCE_DIR "${THIRD_PARTY_PATH}/box_ps")
SET(BOX_PS_DOWNLOAD_DIR "${BOX_PS_SOURCE_DIR}/src/${BOX_PS_PROJECT}")
SET(BOX_PS_DST_DIR "box_ps")
SET(BOX_PS_INSTALL_ROOT "${THIRD_PARTY_PATH}/install")
SET(BOX_PS_INSTALL_DIR ${BOX_PS_INSTALL_ROOT}/${BOX_PS_DST_DIR})
SET(BOX_PS_ROOT ${BOX_PS_INSTALL_DIR})
SET(BOX_PS_INC_DIR ${BOX_PS_ROOT}/include)
SET(BOX_PS_LIB_DIR ${BOX_PS_ROOT}/lib)
SET(BOX_PS_LIB ${BOX_PS_LIB_DIR}/libbox_ps.so)
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${BOX_PS_ROOT}/lib")
INCLUDE_DIRECTORIES(${BOX_PS_INC_DIR})
FILE(WRITE ${BOX_PS_DOWNLOAD_DIR}/CMakeLists.txt
"PROJECT(BOX_PS)\n"
"cmake_minimum_required(VERSION 3.0)\n"
"install(DIRECTORY ${BOX_PS_NAME}/include ${BOX_PS_NAME}/lib \n"
" DESTINATION ${BOX_PS_DST_DIR})\n")
ExternalProject_Add(
${BOX_PS_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${BOX_PS_SOURCE_DIR}
DOWNLOAD_DIR ${BOX_PS_DOWNLOAD_DIR}
DOWNLOAD_COMMAND wget --no-check-certificate ${BOX_PS_URL} -c -q -O ${BOX_PS_NAME}.tar.gz
&& tar zxvf ${BOX_PS_NAME}.tar.gz
DOWNLOAD_NO_PROGRESS 1
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${BOX_PS_INSTALL_ROOT}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${BOX_PS_INSTALL_ROOT}
)
ADD_LIBRARY(box_ps SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET box_ps PROPERTY IMPORTED_LOCATION ${BOX_PS_LIB})
ADD_DEPENDENCIES(box_ps ${BOX_PS_PROJECT})
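With the imported box_ps target defined above, a downstream target links it like any prebuilt shared library. A minimal sketch with hypothetical target and source names (Paddle's own targets typically pull it in through the cc_library(... DEPS box_ps) helper from cmake/generic.cmake instead):

```
# Illustrative only: consuming the imported libbox_ps.so.
add_executable(box_ps_demo demo_main.cc)      # hypothetical target and source
target_link_libraries(box_ps_demo box_ps)     # resolves via the IMPORTED_LOCATION set above
add_dependencies(box_ps_demo extern_box_ps)   # make sure the tarball is fetched first
```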
......@@ -33,7 +33,7 @@ SET(BRPC_LIBRARIES "${BRPC_INSTALL_DIR}/lib/libbrpc.a" CACHE FILEPATH "brpc libr
INCLUDE_DIRECTORIES(${BRPC_INCLUDE_DIR})
# Reference https://stackoverflow.com/questions/45414507/pass-a-list-of-prefix-paths-to-externalproject-add-in-cmake-args
set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib|${THIRD_PARTY_PATH}/install/glog")
set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib|${THIRD_PARTY_PATH}/install/glog")
# If minimal .a is need, you can set WITH_DEBUG_SYMBOLS=OFF
ExternalProject_Add(
......@@ -62,7 +62,7 @@ ExternalProject_Add(
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
)
ADD_DEPENDENCIES(extern_brpc protobuf ssl crypto leveldb gflags glog gtest snappy)
ADD_DEPENDENCIES(extern_brpc protobuf ssl crypto leveldb gflags glog gtest)
ADD_LIBRARY(brpc STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET brpc PROPERTY IMPORTED_LOCATION ${BRPC_LIBRARIES})
ADD_DEPENDENCIES(brpc extern_brpc)
......
......@@ -23,14 +23,14 @@ INCLUDE_DIRECTORIES(${DGC_INCLUDE_DIR})
ExternalProject_Add(
extern_dgc
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/PaddlePaddle/Fleet"
GIT_TAG "2d04dc3800cdd0601f1b65d547dabcc60b0cf9dc"
URL "http://fleet.bj.bcebos.com/collective.tgz"
URL_MD5 "015d565156c3de4e30fe25473f47e7a9"
SOURCE_DIR "${DGC_SOURCES_DIR}"
CONFIGURE_COMMAND ""
BUILD_COMMAND cd collective && make -j
BUILD_COMMAND make -j
INSTALL_COMMAND mkdir -p ${DGC_INSTALL_DIR}/lib/ ${DGC_INCLUDE_DIR}/dgc
&& cp ${DGC_SOURCES_DIR}/collective/build/lib/libdgc.a ${DGC_LIBRARIES}
&& cp ${DGC_SOURCES_DIR}/collective/build/include/dgc.h ${DGC_INCLUDE_DIR}/dgc/
&& cp ${DGC_SOURCES_DIR}/build/lib/libdgc.a ${DGC_LIBRARIES}
&& cp ${DGC_SOURCES_DIR}/build/include/dgc.h ${DGC_INCLUDE_DIR}/dgc/
BUILD_IN_SOURCE 1
)
......
......@@ -3,15 +3,6 @@ INCLUDE(ExternalProject)
SET(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3)
SET(EIGEN_INCLUDE_DIR ${EIGEN_SOURCE_DIR}/src/extern_eigen3)
INCLUDE_DIRECTORIES(${EIGEN_INCLUDE_DIR})
if(NOT WITH_FAST_MATH)
# EIGEN_FAST_MATH: https://eigen.tuxfamily.org/dox/TopicPreprocessorDirectives.html
# enables some optimizations which might affect the accuracy of the result.
# This currently enables the SSE vectorization of sin() and cos(),
# and speedups sqrt() for single precision.
# Defined to 1 by default. Define it to 0 to disable.
add_definitions(-DEIGEN_FAST_MATH=0)
endif()
if(WIN32)
set(EIGEN_GIT_REPOSITORY https://github.com/wopeizl/eigen-git-mirror)
......
......@@ -21,6 +21,8 @@ IF(WIN32)
set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/gflags_static.lib" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE)
ELSE(WIN32)
set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/libgflags.a" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE)
set(BUILD_COMMAND $(MAKE) --silent)
set(INSTALL_COMMAND $(MAKE) install)
ENDIF(WIN32)
INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR})
......@@ -31,6 +33,8 @@ ExternalProject_Add(
GIT_REPOSITORY "https://github.com/gflags/gflags.git"
GIT_TAG 77592648e3f3be87d6c7123eb81cbad75f9aef5a
PREFIX ${GFLAGS_SOURCES_DIR}
BUILD_COMMAND ${BUILD_COMMAND}
INSTALL_COMMAND ${INSTALL_COMMAND}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
......@@ -50,6 +54,7 @@ ExternalProject_Add(
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
)
ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET gflags PROPERTY IMPORTED_LOCATION ${GFLAGS_LIBRARIES})
ADD_DEPENDENCIES(gflags extern_gflags)
......
......@@ -13,6 +13,9 @@
# limitations under the License.
#FIXME:(gongwb) Move brpc's gtest dependency.
include(GNUInstallDirs)
IF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC))
IF(WITH_TESTING)
ENABLE_TESTING()
......@@ -28,14 +31,14 @@ IF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC))
IF(WIN32)
set(GTEST_LIBRARIES
"${GTEST_INSTALL_DIR}/lib/gtest.lib" CACHE FILEPATH "gtest libraries." FORCE)
"${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/gtest.lib" CACHE FILEPATH "gtest libraries." FORCE)
set(GTEST_MAIN_LIBRARIES
"${GTEST_INSTALL_DIR}/lib/gtest_main.lib" CACHE FILEPATH "gtest main libraries." FORCE)
"${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/gtest_main.lib" CACHE FILEPATH "gtest main libraries." FORCE)
ELSE(WIN32)
set(GTEST_LIBRARIES
"${GTEST_INSTALL_DIR}/lib/libgtest.a" CACHE FILEPATH "gtest libraries." FORCE)
"${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/libgtest.a" CACHE FILEPATH "gtest libraries." FORCE)
set(GTEST_MAIN_LIBRARIES
"${GTEST_INSTALL_DIR}/lib/libgtest_main.a" CACHE FILEPATH "gtest main libraries." FORCE)
"${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/libgtest_main.a" CACHE FILEPATH "gtest main libraries." FORCE)
ENDIF(WIN32)
IF(WITH_MKLML)
......@@ -48,7 +51,7 @@ IF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC))
${EXTERNAL_PROJECT_LOG_ARGS}
DEPENDS ${GTEST_DEPENDS}
GIT_REPOSITORY "https://github.com/google/googletest.git"
GIT_TAG "release-1.8.0"
GIT_TAG "release-1.8.1"
PREFIX ${GTEST_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
......
......@@ -34,8 +34,6 @@ ExternalProject_Add(
BUILD_IN_SOURCE 1
)
ADD_DEPENDENCIES(extern_leveldb snappy)
ADD_LIBRARY(leveldb STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET leveldb PROPERTY IMPORTED_LOCATION ${LEVELDB_LIBRARIES})
ADD_DEPENDENCIES(leveldb extern_leveldb)
......@@ -43,7 +43,7 @@ IF(WIN32)
ELSE()
#TODO(intel-huying):
# Now enable Erf function in mklml library temporarily; it will be updated to the official version later.
SET(MKLML_VER "Glibc225_vsErf_mklml_lnx_${TIME_VERSION}" CACHE STRING "" FORCE)
SET(MKLML_VER "csrmm2_mklml_lnx_2019.0.2" CACHE STRING "" FORCE)
SET(MKLML_URL "http://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so)
SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so)
......
......@@ -37,7 +37,7 @@ INCLUDE(GNUInstallDirs)
INCLUDE(ExternalProject)
SET(NGRAPH_PROJECT "extern_ngraph")
SET(NGRAPH_GIT_TAG "4ec94acc11084a5d53418f565529310fa584899a")
SET(NGRAPH_GIT_TAG "e26d602a756f5f83e6c8220f910b61d7089fa951")
SET(NGRAPH_SOURCES_DIR ${THIRD_PARTY_PATH}/ngraph)
SET(NGRAPH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/ngraph)
SET(NGRAPH_INC_DIR ${NGRAPH_INSTALL_DIR}/include)
......@@ -76,6 +76,7 @@ ExternalProject_Add(
CMAKE_ARGS -DMKLDNN_INCLUDE_DIR=${MKLDNN_INC_DIR}
CMAKE_ARGS -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}
CMAKE_ARGS -DMKLML_LIB_DIR=${MKLML_INSTALL_DIR}/lib
CMAKE_ARGS -DNGRAPH_USE_LEGACY_MKLDNN=TRUE
)
add_dependencies(ngraph ${NGRAPH_PROJECT})
......
......@@ -58,7 +58,41 @@ IF(NOT ${CBLAS_FOUND})
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
)
ELSE()
ELSE(NOT WIN32)
SET(CBLAS_FOUND false)
SET(CBLAS_LIBRARIES
"${CBLAS_INSTALL_DIR}/lib/openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
CACHE FILEPATH "openblas library." FORCE)
INCLUDE_DIRECTORIES(${CBLAS_INC_DIR}/openblas) # For openblas code to include its own headers.
INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install)
ExternalProject_Add(
extern_openblas
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY https://github.com/xianyi/OpenBLAS.git
GIT_TAG "v0.3.7"
PREFIX ${CBLAS_SOURCES_DIR}
INSTALL_DIR ${CBLAS_INSTALL_DIR}
BUILD_IN_SOURCE 0
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_INSTALL_PREFIX=${CBLAS_INSTALL_DIR}
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-DBUILD_SHARED_LIBS=ON
-DMSVC_STATIC_CRT=${MSVC_STATIC_CRT}
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${CBLAS_INSTALL_DIR}
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
)
add_custom_command(TARGET extern_openblas POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy ${CBLAS_INSTALL_DIR}/bin/openblas${CMAKE_SHARED_LIBRARY_SUFFIX} ${CBLAS_INSTALL_DIR}/lib )
ADD_LIBRARY(openblas STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET openblas PROPERTY IMPORTED_LOCATION ${CBLAS_LIBRARIES})
ADD_DEPENDENCIES(openblas extern_openblas)
ENDIF(NOT WIN32)
SET(CBLAS_PROVIDER openblas)
ENDIF(NOT ${CBLAS_FOUND})
......
......@@ -222,6 +222,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
-DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR=lib
-DBUILD_SHARED_LIBS=OFF
-Dprotobuf_MSVC_STATIC_RUNTIME=${MSVC_STATIC_CRT}
CMAKE_CACHE_ARGS
-DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR}
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
include (ExternalProject)
# NOTE: snappy is needed when linking with recordio
set(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy)
set(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy)
set(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include" CACHE PATH "snappy include directory." FORCE)
if(WIN32)
SET(SNAPPY_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4244 /wd4267")
else()
SET(SNAPPY_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
endif()
ExternalProject_Add(
extern_snappy
GIT_REPOSITORY "https://github.com/google/snappy"
GIT_TAG "1.1.7"
PREFIX ${SNAPPY_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS=${SNAPPY_CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DBUILD_TESTING=OFF
-DSNAPPY_BUILD_TESTS:BOOL=OFF
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${SNAPPY_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPY_INSTALL_DIR}/lib
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
)
IF(WIN32)
set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/snappy.lib")
else(WIN32)
set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a")
endif (WIN32)
add_library(snappy STATIC IMPORTED GLOBAL)
set_property(TARGET snappy PROPERTY IMPORTED_LOCATION ${SNAPPY_LIBRARIES})
include_directories(${SNAPPY_INCLUDE_DIR})
add_dependencies(snappy extern_snappy)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
include (ExternalProject)
set(SNAPPYSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy_stream)
set(SNAPPYSTREAM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy_stream)
set(SNAPPYSTREAM_INCLUDE_DIR "${SNAPPYSTREAM_INSTALL_DIR}/include" CACHE PATH "snappy stream include directory." FORCE)
if(WIN32)
# Fix me: VS2015 comes without VLA support
set(SNAPPYSTREAM_LIBRARIES "${SNAPPYSTREAM_INSTALL_DIR}/lib/snappystream.lib")
MESSAGE(WARNING, "In windows, snappystream has no compile support for windows,
please build it manually and put it at " ${SNAPPYSTREAM_INSTALL_DIR})
else(WIN32)
set(SNAPPYSTREAM_LIBRARIES "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a")
ExternalProject_Add(
extern_snappystream
GIT_REPOSITORY "https://github.com/hoxnox/snappystream.git"
GIT_TAG "0.2.8"
PREFIX ${SNAPPYSTREAM_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-DSNAPPY_ROOT=${SNAPPY_INSTALL_DIR}
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS
-DCMAKE_INSTALL_PREFIX:PATH=${SNAPPYSTREAM_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPYSTREAM_INSTALL_DIR}/lib
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
DEPENDS snappy
)
endif(WIN32)
add_library(snappystream STATIC IMPORTED GLOBAL)
set_property(TARGET snappystream PROPERTY IMPORTED_LOCATION ${SNAPPYSTREAM_LIBRARIES})
include_directories(${SNAPPYSTREAM_INCLUDE_DIR}) # For snappysteam to include its own headers.
include_directories(${THIRD_PARTY_PATH}/install) # For Paddle to include snappy stream headers.
add_dependencies(snappystream extern_snappystream)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
include (ExternalProject)
IF(NOT ${WITH_CUSTOM_TRAINER})
return()
ENDIF(NOT ${WITH_CUSTOM_TRAINER})
set(YAML_SOURCES_DIR ${THIRD_PARTY_PATH}/yaml-cpp)
set(YAML_INSTALL_DIR ${THIRD_PARTY_PATH}/install/yaml-cpp)
set(YAML_INCLUDE_DIR "${YAML_INSTALL_DIR}/include" CACHE PATH "yaml include directory." FORCE)
SET(YAML_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
ExternalProject_Add(
extern_yaml
GIT_REPOSITORY "https://github.com/jbeder/yaml-cpp"
GIT_TAG "yaml-cpp-0.6.2"
PREFIX ${YAML_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS=${YAML_CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-DCMAKE_INSTALL_PREFIX=${YAML_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR=${YAML_INSTALL_DIR}/lib
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DBUILD_TESTING=OFF
-DYAML_BUILD_TESTS:BOOL=OFF
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${YAML_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR:PATH=${YAML_INSTALL_DIR}/lib
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
)
set(YAML_LIBRARIES "${YAML_INSTALL_DIR}/lib/libyaml-cpp.a")
add_library(yaml-cpp STATIC IMPORTED GLOBAL)
set_property(TARGET yaml-cpp PROPERTY IMPORTED_LOCATION ${YAML_LIBRARIES})
include_directories(${YAML_INCLUDE_DIR})
add_dependencies(yaml-cpp extern_yaml)
......@@ -37,6 +37,12 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
function(safe_set_flag is_c src_list flag_name)
string(REPLACE "-" "_" safe_name ${flag_name})
string(REPLACE "=" "_" safe_name ${safe_name})
if(${flag_name} MATCHES "fsanitize")
set(CMAKE_REQUIRED_FLAGS_RETAINED ${CMAKE_REQUIRED_FLAGS})
set(CMAKE_REQUIRED_FLAGS ${flag_name})
endif()
if(is_c)
CHECK_C_COMPILER_FLAG(${flag_name} C_COMPILER_SUPPORT_FLAG_${safe_name})
set(safe_name C_COMPILER_SUPPORT_FLAG_${safe_name})
......@@ -47,6 +53,10 @@ function(safe_set_flag is_c src_list flag_name)
if(${safe_name})
set(${src_list} "${${src_list}} ${flag_name}" PARENT_SCOPE)
endif()
if(${flag_name} MATCHES "fsanitize")
set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED})
endif()
endfunction()
# helper macro to set cflag
......@@ -108,6 +118,20 @@ if(BARRIER_FOUND)
endif(BARRIER_FOUND)
SET(CMAKE_EXTRA_INCLUDE_FILES "")
# Only one sanitizer is allowed at compile time
string(TOLOWER "${SANITIZER_TYPE}" sanitizer_type)
if(sanitizer_type STREQUAL "address")
set(fsanitize "-fsanitize=address")
elseif(sanitizer_type STREQUAL "leak")
set(fsanitize "-fsanitize=leak")
elseif(sanitizer_type STREQUAL "memory")
set(fsanitize "-fsanitize=memory")
elseif(sanitizer_type STREQUAL "thread")
set(fsanitize "-fsanitize=thread")
elseif(sanitizer_type STREQUAL "undefined")
set(fsanitize "-fsanitize=undefined")
endif()
# Common flags. the compiler flag used for C/C++ sources whenever release or debug
# Do not care if this flag is support for gcc.
......@@ -131,7 +155,7 @@ set(COMMON_FLAGS
-Wno-error=terminate # Warning in PADDLE_ENFORCE
-Wno-error=int-in-bool-context # Warning in Eigen gcc 7.2
-Wimplicit-fallthrough=0 # Warning in tinyformat.h
-Wno-error=maybe-uninitialized # Warning in boost gcc 7.2
${fsanitize}
)
set(GPU_COMMON_FLAGS
......@@ -173,14 +197,13 @@ endif(UNIX AND NOT APPLE)
foreach(flag ${COMMON_FLAGS})
safe_set_cflag(CMAKE_C_FLAGS ${flag})
safe_set_cxxflag(CMAKE_CXX_FLAGS ${flag})
endforeach()
foreach(flag ${GPU_COMMON_FLAGS})
safe_set_nvflag(${flag})
endforeach()
if(WIN32)
if(WIN32 AND MSVC_STATIC_CRT)
# windows build turn off warnings.
safe_set_static_flag()
foreach(flag_var
......@@ -191,4 +214,4 @@ safe_set_static_flag()
string(REGEX REPLACE "(^| )/W[0-9]( |$)" " " ${flag_var} "${${flag_var}}")
set(flag_var "${flag_var} /w")
endforeach(flag_var)
endif(WIN32)
endif()
......@@ -389,7 +389,6 @@ function(cc_test_run TARGET_NAME)
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_limit_of_tmp_allocation=4294967296) # 4G
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
# No unit test should exceed 10 minutes.
set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
......@@ -472,7 +471,6 @@ function(nv_test TARGET_NAME)
add_test(${TARGET_NAME} ${TARGET_NAME})
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_limit_of_tmp_allocation=4294967296) # 4G
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
endif()
endfunction(nv_test)
......@@ -725,7 +723,7 @@ function(py_test TARGET_NAME)
if(WITH_COVERAGE)
add_test(NAME ${TARGET_NAME}
COMMAND ${CMAKE_COMMAND} -E env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true
FLAGS_cpu_deterministic=true FLAGS_limit_of_tmp_allocation=4294967296 # 4G
FLAGS_cpu_deterministic=true
PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
COVERAGE_FILE=${PADDLE_BINARY_DIR}/python-coverage.data
${PYTHON_EXECUTABLE} -m coverage run --branch -p ${py_test_SRCS} ${py_test_ARGS}
......@@ -733,7 +731,7 @@ function(py_test TARGET_NAME)
else()
add_test(NAME ${TARGET_NAME}
COMMAND ${CMAKE_COMMAND} -E env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true
FLAGS_cpu_deterministic=true FLAGS_limit_of_tmp_allocation=4294967296 # 4G
FLAGS_cpu_deterministic=true
PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
......
......@@ -13,12 +13,19 @@
# limitations under the License.
# make package for paddle fluid shared and static library
if(WIN32)
if(NOT PYTHON_EXECUTABLE)
FIND_PACKAGE(PythonInterp REQUIRED)
endif()
endif()
set(COPY_SCRIPT_DIR ${PADDLE_SOURCE_DIR}/cmake)
function(copy TARGET)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DSTS DEPS)
set(multiValueArgs SRCS DSTS)
cmake_parse_arguments(copy_lib "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(fluid_lib_dist_dep ${TARGET} ${fluid_lib_dist_dep} PARENT_SCOPE)
list(LENGTH copy_lib_SRCS copy_lib_SRCS_len)
list(LENGTH copy_lib_DSTS copy_lib_DSTS_len)
......@@ -26,43 +33,16 @@ function(copy TARGET)
message(FATAL_ERROR "${TARGET} source numbers are not equal to destination numbers")
endif ()
math(EXPR len "${copy_lib_SRCS_len} - 1")
add_custom_target(${TARGET} DEPENDS ${copy_lib_DEPS})
foreach (index RANGE ${len})
list(GET copy_lib_SRCS ${index} src)
list(GET copy_lib_DSTS ${index} dst)
if (WIN32)
if(IS_DIRECTORY ${src})
get_filename_component(last_path ${src} NAME)
string(APPEND dst "/" ${last_path})
add_custom_command(TARGET ${TARGET} PRE_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory "${dst}"
)
if(EXISTS ${src})
add_custom_command(TARGET ${TARGET} PRE_BUILD
COMMAND cmake -E copy_directory "${src}" "${dst}"
COMMENT "copying ${src} -> ${dst}")
else()
message(WARNING "${src} not exist!")
endif()
else()
# windows cmd shell will not expand wildcard automatically.
# below expand the files, and copy them by rules.
file(GLOB src_files ${src})
if (NOT "${src_files}" STREQUAL "")
list(REMOVE_DUPLICATES src_files)
endif ()
add_custom_command(TARGET ${TARGET} PRE_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory "${dst}"
)
foreach (src_file ${src_files})
add_custom_command(TARGET ${TARGET} PRE_BUILD
COMMAND ${CMAKE_COMMAND} -E copy "${src_file}" "${dst}"
COMMENT "copying ${src_file} -> ${dst}")
endforeach ()
endif()
else (WIN32) # not windows
add_custom_command(TARGET ${TARGET} PRE_BUILD
if (WIN32) #windows
file(TO_NATIVE_PATH ${src} native_src)
file(TO_NATIVE_PATH ${dst} native_dst)
add_custom_command(TARGET ${TARGET} POST_BUILD
COMMAND ${PYTHON_EXECUTABLE} ${COPY_SCRIPT_DIR}/copyfile.py ${native_src} ${native_dst})
else (WIN32) #not windows
add_custom_command(TARGET ${TARGET} POST_BUILD
COMMAND mkdir -p "${dst}"
COMMAND cp -r "${src}" "${dst}"
COMMENT "copying ${src} -> ${dst}")
......@@ -71,210 +51,189 @@ function(copy TARGET)
endfunction()
# third party
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/eigen3")
copy(eigen3_lib
SRCS ${EIGEN_INCLUDE_DIR}/Eigen/Core ${EIGEN_INCLUDE_DIR}/Eigen/src ${EIGEN_INCLUDE_DIR}/unsupported/Eigen
DSTS ${dst_dir}/Eigen ${dst_dir}/Eigen ${dst_dir}/unsupported
DEPS eigen3
)
set(third_party_deps eigen3 gflags glog boost xxhash zlib)
if(NOT PROTOBUF_FOUND OR WIN32)
list(APPEND third_party_deps extern_protobuf)
endif ()
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/gflags")
copy(gflags_lib
SRCS ${GFLAGS_INCLUDE_DIR} ${GFLAGS_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib
DEPS gflags
)
if (WITH_MKLML)
list(APPEND third_party_deps mklml)
elseif (NOT CBLAS_FOUND OR WIN32)
list(APPEND third_party_deps extern_openblas)
endif ()
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/glog")
copy(glog_lib
SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib
DEPS glog
)
if (WITH_MKLDNN)
list(APPEND third_party_deps mkldnn_shared_lib)
endif ()
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/boost/")
copy(boost_lib
SRCS ${BOOST_INCLUDE_DIR}/boost
DSTS ${dst_dir}
DEPS boost
)
if (WITH_NGRAPH)
list(APPEND third_party_deps ngraph)
endif ()
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/xxhash")
copy(xxhash_lib
SRCS ${XXHASH_INCLUDE_DIR} ${XXHASH_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib
DEPS xxhash
)
add_custom_target(third_party DEPENDS ${third_party_deps})
if (NOT PROTOBUF_FOUND OR WIN32)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/protobuf")
copy(protobuf_lib
SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LIBRARY}
DSTS ${dst_dir} ${dst_dir}/lib
DEPS extern_protobuf
)
endif ()
# inference-only library
set(inference_lib_deps third_party paddle_fluid paddle_fluid_shared)
add_custom_target(inference_lib_dist DEPENDS ${inference_lib_deps})
if (WITH_MKLML)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mklml")
set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/eigen3")
copy(inference_lib_dist
SRCS ${EIGEN_INCLUDE_DIR}/Eigen/Core ${EIGEN_INCLUDE_DIR}/Eigen/src ${EIGEN_INCLUDE_DIR}/unsupported/Eigen
DSTS ${dst_dir}/Eigen ${dst_dir}/Eigen ${dst_dir}/unsupported)
set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/boost")
copy(inference_lib_dist
SRCS ${BOOST_INCLUDE_DIR}/boost
DSTS ${dst_dir})
if(WITH_MKLML)
set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/install/mklml")
if(WIN32)
copy(mklml_lib
SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_SHARED_LIB}
${MKLML_SHARED_LIB_DEPS} ${MKLML_SHARED_IOMP_LIB} ${MKLML_INC_DIR}
DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir}/lib
${dst_dir}/lib ${dst_dir}/lib ${dst_dir}
DEPS mklml
)
copy(inference_lib_dist
SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_SHARED_LIB}
${MKLML_SHARED_LIB_DEPS} ${MKLML_SHARED_IOMP_LIB} ${MKLML_INC_DIR}
DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir}/lib
${dst_dir}/lib ${dst_dir}/lib ${dst_dir})
else()
copy(mklml_lib
SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_INC_DIR}
DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir}
DEPS mklml
)
copy(inference_lib_dist
SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_INC_DIR}
DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir})
endif()
elseif (NOT CBLAS_FOUND OR WIN32)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/openblas")
copy(openblas_lib
set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/install/openblas")
copy(inference_lib_dist
SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include
DSTS ${dst_dir} ${dst_dir}
DEPS extern_openblas
)
DSTS ${dst_dir} ${dst_dir})
endif ()
if (WITH_MKLDNN)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mkldnn")
if(WIN32)
copy(mkldnn_lib
SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB} ${MKLDNN_LIB}
DSTS ${dst_dir} ${dst_dir}/lib ${dst_dir}/lib
DEPS mkldnn_shared_lib
)
else()
copy(mkldnn_lib
SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB}
DSTS ${dst_dir} ${dst_dir}/lib
DEPS mkldnn_shared_lib
)
endif()
endif ()
if (WITH_NGRAPH)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/ngraph")
copy(ngraph_lib
SRCS ${NGRAPH_INC_DIR} ${NGRAPH_LIB_DIR}
DSTS ${dst_dir} ${dst_dir}
DEPS ngraph
)
endif ()
if(WITH_MKLDNN)
set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/install/mkldnn")
if(WIN32)
copy(inference_lib_dist
SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB} ${MKLDNN_LIB}
DSTS ${dst_dir} ${dst_dir}/lib ${dst_dir}/lib)
else()
copy(inference_lib_dist
SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB}
DSTS ${dst_dir} ${dst_dir}/lib)
endif()
endif()
set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/install/gflags")
copy(inference_lib_dist
SRCS ${GFLAGS_INCLUDE_DIR} ${GFLAGS_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy")
copy(snappy_lib
SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib
DEPS snappy)
set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/install/glog")
copy(inference_lib_dist
SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappystream")
copy(snappystream_lib
SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib
DEPS snappystream)
set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/install/xxhash")
copy(inference_lib_dist
SRCS ${XXHASH_INCLUDE_DIR} ${XXHASH_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib")
copy(zlib_lib
set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/install/zlib")
copy(inference_lib_dist
SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib
DEPS zlib)
# paddle fluid module
set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
set(dst_dir "${FLUID_INSTALL_DIR}/paddle/fluid")
set(module "framework")
if (NOT WIN32)
set(framework_lib_deps framework_py_proto)
endif (NOT WIN32)
copy(framework_lib DEPS ${framework_lib_deps}
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/data_feed.pb.h ${src_dir}/${module}/ir/memory_optimize_pass/*.h
${src_dir}/${module}/ir/*.h ${src_dir}/${module}/fleet/*.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}/ir/memory_optimize_pass ${dst_dir}/${module}/ir ${dst_dir}/${module}/fleet
)
DSTS ${dst_dir} ${dst_dir}/lib)
set(module "memory")
copy(memory_lib
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/detail/*.h ${src_dir}/${module}/allocation/*.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/detail ${dst_dir}/${module}/allocation
)
set(inference_deps paddle_fluid_shared paddle_fluid)
if (NOT PROTOBUF_FOUND OR WIN32)
set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/install/protobuf")
copy(inference_lib_dist
SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LIBRARY}
DSTS ${dst_dir} ${dst_dir}/lib)
endif ()
set(module "inference/api")
if (WITH_NGRAPH)
set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/install/ngraph")
copy(inference_lib_dist
SRCS ${NGRAPH_INC_DIR} ${NGRAPH_LIB_DIR}
DSTS ${dst_dir} ${dst_dir})
endif ()
if (TENSORRT_FOUND)
copy(tensorrt_lib DEPS ${inference_deps}
SRCS ${TENSORRT_ROOT}/include/Nv*.h ${TENSORRT_ROOT}/lib/libnvinfer*
DSTS ${FLUID_INSTALL_DIR}/third_party/install/tensorrt/include ${FLUID_INSTALL_DIR}/third_party/install/tensorrt/lib)
set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/install/tensorrt")
copy(inference_lib_dist
SRCS ${TENSORRT_ROOT}/include/Nv*.h ${TENSORRT_ROOT}/lib/*nvinfer*
DSTS ${dst_dir}/include ${dst_dir}/lib)
endif ()
if (ANAKIN_FOUND)
copy(anakin_lib DEPS ${inference_deps}
set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/install/anakin")
copy(inference_lib_dist
SRCS ${ANAKIN_ROOT}/*
DSTS ${FLUID_INSTALL_DIR}/third_party/install/anakin)
DSTS ${dst_dir})
endif ()
set(module "inference")
copy(inference_lib_dist
SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
DSTS ${FLUID_INFERENCE_INSTALL_DIR})
set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
if(WIN32)
set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/libpaddle_fluid.*)
else(WIN32)
set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*)
endif(WIN32)
copy(inference_lib DEPS ${inference_deps}
SRCS ${src_dir}/${module}/*.h ${paddle_fluid_lib}
${src_dir}/${module}/api/paddle_*.h
copy(inference_lib_dist
SRCS ${src_dir}/inference/api/paddle_*.h ${paddle_fluid_lib}
DSTS ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include ${FLUID_INFERENCE_INSTALL_DIR}/paddle/lib)
# fluid library for both train and inference
set(fluid_lib_deps inference_lib_dist)
add_custom_target(fluid_lib_dist ALL DEPENDS ${fluid_lib_deps})
set(dst_dir "${FLUID_INSTALL_DIR}/paddle/fluid")
set(module "inference")
copy(fluid_lib_dist
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/api/paddle_*.h ${paddle_fluid_lib}
DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
)
set(module "framework")
set(framework_lib_deps framework_proto)
add_dependencies(fluid_lib_dist ${framework_lib_deps})
copy(fluid_lib_dist
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/data_feed.pb.h ${src_dir}/${module}/ir/memory_optimize_pass/*.h
${src_dir}/${module}/ir/*.h ${src_dir}/${module}/fleet/*.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}/ir/memory_optimize_pass ${dst_dir}/${module}/ir ${dst_dir}/${module}/fleet)
set(module "memory")
copy(fluid_lib_dist
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/detail/*.h ${src_dir}/${module}/allocation/*.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/detail ${dst_dir}/${module}/allocation
)
set(module "platform")
copy(platform_lib DEPS profiler_py_proto
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details
set(platform_lib_deps profiler_proto)
add_dependencies(fluid_lib_dist ${platform_lib_deps})
copy(fluid_lib_dist
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/platform/profiler.pb.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details ${dst_dir}/${module}
)
set(module "string")
copy(string_lib
copy(fluid_lib_dist
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/tinyformat/*.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/tinyformat
)
set(module "pybind")
copy(pybind_lib
copy(fluid_lib_dist
SRCS ${CMAKE_CURRENT_BINARY_DIR}/paddle/fluid/${module}/pybind.h
DSTS ${dst_dir}/${module}
)
# CMakeCache Info
copy(cmake_cache
SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
DSTS ${FLUID_INSTALL_DIR})
# This command generates a complete fluid library for both train and inference
add_custom_target(fluid_lib_dist DEPENDS ${fluid_lib_dist_dep})
# Following commands generate a inference-only fluid library
# third_party, version.txt and CMakeCache.txt are placed in the same positions as in ${FLUID_INSTALL_DIR}
copy(third_party DEPS fluid_lib_dist
SRCS ${FLUID_INSTALL_DIR}/third_party ${FLUID_INSTALL_DIR}/CMakeCache.txt
DSTS ${FLUID_INFERENCE_INSTALL_DIR} ${FLUID_INFERENCE_INSTALL_DIR}
copy(fluid_lib_dist
SRCS ${FLUID_INFERENCE_INSTALL_DIR}/third_party ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
DSTS ${FLUID_INSTALL_DIR} ${FLUID_INSTALL_DIR}
)
# only need libpaddle_fluid.so/a and paddle_*.h for inference-only library
copy(inference_api_lib DEPS fluid_lib_dist
SRCS ${paddle_fluid_lib}
${FLUID_INSTALL_DIR}/paddle/fluid/inference/paddle_*.h
DSTS ${FLUID_INFERENCE_INSTALL_DIR}/paddle/lib ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include
)
add_custom_target(inference_lib_dist DEPENDS third_party inference_api_lib)
# paddle fluid version
function(version version_file)
execute_process(
......
......@@ -110,7 +110,7 @@ function(op_library TARGET)
# Define operators that don't need pybind here.
foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op"
"tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op" "sync_batch_norm_op" "deformable_conv_op" "dgc_op")
"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op" "sync_batch_norm_op" "dgc_op" "fused_fc_elementwise_layernorm_op")
if ("${TARGET}" STREQUAL "${manual_pybind_op}")
set(pybind_flag 1)
endif()
......@@ -191,9 +191,6 @@ function(op_library TARGET)
file(APPEND ${pybind_file} "USE_OP(fake_quantize_abs_max);\n")
elseif(${TARGET} STREQUAL "tensorrt_engine_op")
message(STATUS "Pybind skips [tensorrt_engine_op], for this OP is only used in inference")
elseif(${TARGET} STREQUAL "fc")
# HACK: fc only have mkldnn and cpu, which would mismatch the cpu only condition
file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
else()
file(APPEND ${pybind_file} "USE_OP(${TARGET});\n")
endif()
......
set(CPACK_PACKAGE_NAME paddle)
set(CPACK_PACKAGE_VERSION_MAJOR ${PADDLE_MAJOR_VERSION})
set(CPACK_PACKAGE_VERSION_MINOR ${PADDLE_MINOR_VERSION})
set(CPACK_PACKAGE_VERSION_PATCH ${PADDLE_PATCH_VERSION})
set(CPACK_PACKAGE_VERSION ${PADDLE_VERSION})
## DEB Settings
set(CPACK_DEBIAN_PACKAGE_NAME paddle)
set(CPACK_DEBIAN_PACKAGE_ARCHITECTURE amd64)
set(CPACK_DEBIAN_PACKAGE_MAINTAINER PaddlePaddle Dev <paddle-dev@baidu.com>)
set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "Paddle")
set(CPACK_PACKAGE_DESCRIPTION "")
set(CPACK_DEBIAN_PACKAGE_DEPENDS "libpython2.7-dev, libstdc++6, python-pip, curl, libgfortran3, python-pip-whl")
set(CPACK_DEBIAN_PACKAGE_SECTION Devel)
set(CPACK_DEBIAN_PACKAGE_VERSION ${PADDLE_VERSION})
set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PADDLE_SOURCE_DIR}/paddle/scripts/deb/postinst")
#set(CPACK_GENERATOR "DEB")
# Start cpack
include (CMakePackageConfigHelpers)
include (CPack)
......@@ -2,14 +2,28 @@ if(NOT WITH_GPU)
return()
endif()
set(TENSORRT_ROOT "/usr" CACHE PATH "TENSORRT ROOT")
if(WIN32)
if("${TENSORRT_ROOT}" STREQUAL "")
message(WARNING "Please specify the TensorRT root path: TENSORRT_ROOT.")
endif()
string(REPLACE "\\" "/" TENSORRT_ROOT "${TENSORRT_ROOT}")
set(TR_INFER_LIB nvinfer.lib)
set(TR_INFER_RT nvinfer.dll)
set(TR_INFER_PLUGIN_RT nvinfer_plugin.dll)
else()
set(TENSORRT_ROOT "/usr" CACHE PATH "TENSORRT ROOT")
set(TR_INFER_LIB libnvinfer.a)
set(TR_INFER_RT libnvinfer.so)
set(TR_INFER_PLUGIN_RT libnvinfer_plugin.so)
endif()
find_path(TENSORRT_INCLUDE_DIR NvInfer.h
PATHS ${TENSORRT_ROOT} ${TENSORRT_ROOT}/include
$ENV{TENSORRT_ROOT} $ENV{TENSORRT_ROOT}/include
NO_DEFAULT_PATH
)
find_library(TENSORRT_LIBRARY NAMES libnvinfer.so libnvinfer.a
find_library(TENSORRT_LIBRARY NAMES ${TR_INFER_LIB} ${TR_INFER_RT}
PATHS ${TENSORRT_ROOT} ${TENSORRT_ROOT}/lib
$ENV{TENSORRT_ROOT} $ENV{TENSORRT_ROOT}/lib
NO_DEFAULT_PATH
......
This diff has been collapsed.
......@@ -4,7 +4,6 @@ add_subdirectory(framework)
add_subdirectory(imperative)
add_subdirectory(operators)
add_subdirectory(string)
add_subdirectory(recordio)
add_subdirectory(pybind)
# NOTE: please add subdirectory inference at last.
......
......@@ -63,7 +63,7 @@ if(WITH_GPU)
else()
cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor)
endif()
cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version)
cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version)
cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
......@@ -123,8 +123,8 @@ cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_co
cc_library(transfer_scope_cache SRCS transfer_scope_cache.cc DEPS scope framework_proto device_context)
cc_library(op_kernel_type SRCS op_kernel_type.cc DEPS device_context place)
cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type)
cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog data_feed_proto
shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack)
cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context)
......@@ -133,7 +133,9 @@ cc_test(version_test SRCS version_test.cc DEPS version)
cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version)
cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc memory_optimize_helper)
cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
cc_library(op_call_stack SRCS op_call_stack.cc DEPS op_proto_maker enforce)
nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
......@@ -193,18 +195,17 @@ else()
cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
endif()
target_link_libraries(executor while_op_helper executor_gc_helper recurrent_op_helper)
target_link_libraries(executor while_op_helper executor_gc_helper recurrent_op_helper conditional_block_op_helper)
cc_library(parallel_executor SRCS parallel_executor.cc DEPS
threaded_ssa_graph_executor scope_buffered_ssa_graph_executor parallel_ssa_graph_executor async_ssa_graph_executor
graph build_strategy
fast_threaded_ssa_graph_executor variable_helper)
cc_library(prune SRCS prune.cc DEPS framework_proto)
cc_library(prune SRCS prune.cc DEPS framework_proto boost)
cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
proto_desc)
cc_test(inplace_op_inference_test SRCS inplace_op_inference_test.cc DEPS inplace_op_pass op_registry proto_desc op_info memory_optimize_helper pass_builder)
cc_library(selected_rows SRCS selected_rows.cc DEPS tensor)
cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)
......@@ -222,6 +223,9 @@ endif (NOT WIN32)
cc_library(dlpack_tensor SRCS dlpack_tensor.cc DEPS tensor dlpack)
cc_test(dlpack_tensor_test SRCS dlpack_tensor_test.cc DEPS dlpack_tensor glog)
cc_library(op_compatible_info SRCS op_compatible_info DEPS string_helper)
cc_test(op_compatible_info_test SRCS op_compatible_info_test.cc DEPS op_compatible_info string_helper glog)
# Get the current working branch
execute_process(
COMMAND git rev-parse --abbrev-ref HEAD
......
......@@ -168,10 +168,10 @@ class ArchiveBase {
#else
if (newsize > Capacity()) {
#endif
Reserve(std::max(Capacity() * 2, newsize));
Reserve((std::max)(Capacity() * 2, newsize));
}
finish_ = buffer_ + newsize;
cursor_ = std::min(cursor_, finish_);
cursor_ = (std::min)(cursor_, finish_);
}
void Reserve(size_t newcap) {
......@@ -207,7 +207,7 @@ class ArchiveBase {
#else
if (size > size_t(limit_ - finish_)) {
#endif
Reserve(std::max(Capacity() * 2, Length() + size));
Reserve((std::max)(Capacity() * 2, Length() + size));
}
}
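The parenthesized (std::max)/(std::min) calls introduced above guard against the function-like min/max macros that <windows.h> defines unless NOMINMAX is set. A minimal standalone sketch of the idiom; the macro below merely simulates the Windows header and is not part of this change:

#include <algorithm>
#include <cstddef>
#include <cstdio>

// Simulate what <windows.h> does when NOMINMAX is not defined.
#define max(a, b) (((a) > (b)) ? (a) : (b))

size_t GrowCapacity(size_t capacity, size_t newsize) {
  // return std::max(capacity * 2, newsize);  // would not compile: 'max' expands as a macro
  return (std::max)(capacity * 2, newsize);   // parentheses suppress macro expansion
}

int main() {
  std::printf("%zu\n", GrowCapacity(8, 3));  // prints 16
  return 0;
}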
......@@ -311,6 +311,18 @@ class Archive<BinaryArchiveType> : public ArchiveBase {
*this >> x;
return x;
}
template <class... ARGS>
void Printf(const char* fmt, ARGS&&... args) {
size_t temp = Limit() - Finish();
int len = snprintf(Finish(), temp, fmt, args...);
CHECK(len >= 0); // NOLINT
if ((size_t)len >= temp) {
PrepareWrite(len + 1);
CHECK(snprintf(Finish(), (size_t)len + 1, fmt, args...) == len);
}
AdvanceFinish(len);
}
};
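Printf above uses the common two-pass snprintf pattern: format into the space that is left, and if the reported length shows the output was truncated, grow the buffer to exactly that length and format again. A standalone sketch of the same idiom over a std::vector<char> (a hypothetical helper, not the Archive class):

#include <cstdio>
#include <string>
#include <vector>

// Hypothetical helper demonstrating the two-pass snprintf sizing idiom.
template <class... Args>
std::string FormatString(const char* fmt, Args&&... args) {
  std::vector<char> buf(64);                      // first guess at the size
  int len = std::snprintf(buf.data(), buf.size(), fmt, args...);
  if (len < 0) return "";                         // formatting error
  if (static_cast<size_t>(len) >= buf.size()) {   // output was truncated
    buf.resize(static_cast<size_t>(len) + 1);     // snprintf reported the exact length needed
    std::snprintf(buf.data(), buf.size(), fmt, args...);
  }
  return std::string(buf.data(), static_cast<size_t>(len));
}

int main() {
  std::printf("%s\n", FormatString("batch=%d loss=%.3f", 32, 0.125).c_str());
  return 0;
}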
template <class AR, class T, size_t N>
......@@ -518,11 +530,11 @@ Archive<AR>& operator>>(Archive<AR>& ar, std::tuple<T...>& x) {
} \
template <class AR, class KEY, class VALUE, class... ARGS> \
Archive<AR>& operator>>(Archive<AR>& ar, MAP_TYPE<KEY, VALUE, ARGS...>& p) { \
size_t size = ar.template Get<size_t>(); \
size_t size = ar.template get<size_t>(); \
p.clear(); \
RESERVE_STATEMENT; \
for (size_t i = 0; i < size; i++) { \
p.insert(ar.template Get<std::pair<KEY, VALUE>>()); \
p.insert(ar.template get<std::pair<KEY, VALUE>>()); \
} \
return ar; \
}
......@@ -539,11 +551,11 @@ Archive<AR>& operator>>(Archive<AR>& ar, std::tuple<T...>& x) {
} \
template <class AR, class KEY, class VALUE, class... ARGS> \
Archive<AR>& operator>>(Archive<AR>& ar, MAP_TYPE<KEY, VALUE, ARGS...>& p) { \
size_t size = ar.template Get<uint64_t>(); \
size_t size = ar.template get<uint64_t>(); \
p.clear(); \
RESERVE_STATEMENT; \
for (size_t i = 0; i < size; i++) { \
p.insert(ar.template Get<std::pair<KEY, VALUE>>()); \
p.insert(ar.template get<std::pair<KEY, VALUE>>()); \
} \
return ar; \
}
......@@ -568,11 +580,11 @@ ARCHIVE_REPEAT(std::unordered_multimap, p.reserve(size))
} \
template <class AR, class KEY, class... ARGS> \
Archive<AR>& operator>>(Archive<AR>& ar, SET_TYPE<KEY, ARGS...>& p) { \
size_t size = ar.template Get<size_t>(); \
size_t size = ar.template get<size_t>(); \
p.clear(); \
RESERVE_STATEMENT; \
for (size_t i = 0; i < size; i++) { \
p.insert(ar.template Get<KEY>()); \
p.insert(ar.template get<KEY>()); \
} \
return ar; \
}
......@@ -588,11 +600,11 @@ ARCHIVE_REPEAT(std::unordered_multimap, p.reserve(size))
} \
template <class AR, class KEY, class... ARGS> \
Archive<AR>& operator>>(Archive<AR>& ar, SET_TYPE<KEY, ARGS...>& p) { \
size_t size = ar.template Get<uint64_t>(); \
size_t size = ar.template get<uint64_t>(); \
p.clear(); \
RESERVE_STATEMENT; \
for (size_t i = 0; i < size; i++) { \
p.insert(ar.template Get<KEY>()); \
p.insert(ar.template get<KEY>()); \
} \
return ar; \
}
......
......@@ -40,7 +40,7 @@ class ChannelObject {
// capacity can be zero
explicit ChannelObject(size_t capacity) {
capacity_ = std::min(MaxCapacity(), capacity);
capacity_ = (std::min)(MaxCapacity(), capacity);
}
void Clear() {
......@@ -192,7 +192,7 @@ class ChannelObject {
std::condition_variable full_cond_;
static constexpr size_t MaxCapacity() {
return std::numeric_limits<size_t>::max() / 2;
return (std::numeric_limits<size_t>::max)() / 2;
}
void Notify() {
......@@ -289,7 +289,7 @@ template <class T>
using Channel = std::shared_ptr<ChannelObject<T>>;
template <class T>
Channel<T> MakeChannel(size_t capacity = std::numeric_limits<size_t>::max()) {
Channel<T> MakeChannel(size_t capacity = (std::numeric_limits<size_t>::max)()) {
return std::make_shared<ChannelObject<T>>(capacity);
}
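MakeChannel/ChannelObject provide the bounded, thread-safe queue that the data feeds and datasets in this change push records through. A rough usage sketch built only from calls that appear elsewhere in this diff (Write, Close, ReadAll); the include path is an assumption:

#include <iostream>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/channel.h"  // path assumed

int main() {
  // Bounded, thread-safe channel of ints; the capacity here is arbitrary.
  auto ch = paddle::framework::MakeChannel<int>(1024);
  std::vector<int> batch = {1, 2, 3, 4};
  ch->Write(std::move(batch));  // producer side: move a whole batch in
  ch->Close();                  // signal that no more writes will come
  std::vector<int> all;
  ch->ReadAll(all);             // consumer side: drain everything
  std::cout << "got " << all.size() << " items\n";
  return 0;
}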
......@@ -332,7 +332,7 @@ class ChannelReader {
}
if (cursor_ >= buffer_.size()) {
cursor_ = 0;
if (channel_->Read(buffer_) == 0) {
if (channel_->read(buffer_) == 0) {
failed_ = true;
return *this;
}
......@@ -370,7 +370,7 @@ class ChannelWriter {
void Reset(ChannelObject<T>* channel) {
CHECK(buffer_.empty()) << "Forgot to flush";
CHECK(channel != nullptr) << "Channel can not be nullptr";
// CHECK(channel != nullptr) << "Channel can not be nullptr";
channel_ = channel;
buffer_.clear();
failed_ = !channel;
......
#pragma once
#include <string>
namespace paddle {
namespace framework {
static std::string paddle_commit() {
return "95c1816ec0";
}
static std::string paddle_compile_branch() {
return "develop";
}
static std::string paddle_version() {
return "0.0.0";
}
} // namespace framework
} // namespace paddle
......@@ -33,11 +33,53 @@ limitations under the License. */
#include "io/shell.h"
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/fleet/fleet_wrapper.h"
#include "paddle/fluid/platform/timer.h"
namespace paddle {
namespace framework {
void RecordCandidateList::ReSize(size_t length) {
_mutex.lock();
_capacity = length;
CHECK(_capacity > 0); // NOLINT
_candidate_list.clear();
_candidate_list.resize(_capacity);
_full = false;
_cur_size = 0;
_total_size = 0;
_mutex.unlock();
}
void RecordCandidateList::ReInit() {
_mutex.lock();
_full = false;
_cur_size = 0;
_total_size = 0;
_mutex.unlock();
}
void RecordCandidateList::AddAndGet(const Record& record,
RecordCandidate* result) {
_mutex.lock();
size_t index = 0;
++_total_size;
auto fleet_ptr = FleetWrapper::GetInstance();
if (!_full) {
_candidate_list[_cur_size++] = record;
_full = (_cur_size == _capacity);
} else {
CHECK(_cur_size == _capacity);
index = fleet_ptr->LocalRandomEngine()() % _total_size;
if (index < _capacity) {
_candidate_list[index] = record;
}
}
index = fleet_ptr->LocalRandomEngine()() % _cur_size;
*result = _candidate_list[index];
_mutex.unlock();
}
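AddAndGet above is essentially reservoir sampling: the first _capacity records fill the list, and after that each new record overwrites a uniformly random slot with probability capacity/total, so every record seen so far is equally likely to be a candidate. A minimal standalone sketch of that sampling rule (simplified types, not the class above):

#include <cstdint>
#include <iostream>
#include <random>
#include <vector>

// Simplified reservoir sampler over ints; mirrors the replace-with-probability
// capacity/total rule used by RecordCandidateList::AddAndGet.
class Reservoir {
 public:
  explicit Reservoir(size_t capacity) : capacity_(capacity) {}
  void Add(int value, std::mt19937_64* rng) {
    ++total_;
    if (items_.size() < capacity_) {  // still filling the reservoir
      items_.push_back(value);
      return;
    }
    uint64_t index = (*rng)() % total_;  // keep with probability capacity / total
    if (index < capacity_) items_[index] = value;
  }
  const std::vector<int>& Items() const { return items_; }

 private:
  size_t capacity_;
  uint64_t total_ = 0;
  std::vector<int> items_;
};

int main() {
  std::mt19937_64 rng(2019);
  Reservoir r(10);
  for (int i = 0; i < 100000; ++i) r.Add(i, &rng);
  for (int v : r.Items()) std::cout << v << " ";
  std::cout << "\n";
  return 0;
}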
void DataFeed::AddFeedVar(Variable* var, const std::string& name) {
CheckInit();
for (size_t i = 0; i < use_slots_.size(); ++i) {
......@@ -101,11 +143,24 @@ void DataFeed::AssignFeedVar(const Scope& scope) {
}
}
void DataFeed::CopyToFeedTensor(void* dst, const void* src, size_t size) {
if (platform::is_cpu_place(this->place_)) {
memcpy(dst, src, size);
} else {
#ifdef PADDLE_WITH_CUDA
cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice);
#else
PADDLE_THROW("Not supported GPU, Please compile WITH_GPU option");
#endif
}
}
template <typename T>
void PrivateQueueDataFeed<T>::SetQueueSize(int queue_size) {
PADDLE_ENFORCE(queue_size > 0, "Illegal queue size: %d.", queue_size);
queue_size_ = queue_size;
queue_ = paddle::framework::MakeChannel<T>();
queue_->SetCapacity(queue_size);
}
template <typename T>
......@@ -169,6 +224,7 @@ InMemoryDataFeed<T>::InMemoryDataFeed() {
this->thread_id_ = 0;
this->thread_num_ = 1;
this->parse_ins_id_ = false;
this->parse_content_ = false;
this->input_channel_ = nullptr;
this->output_channel_ = nullptr;
this->consume_channel_ = nullptr;
......@@ -252,6 +308,11 @@ void InMemoryDataFeed<T>::SetThreadNum(int thread_num) {
thread_num_ = thread_num;
}
template <typename T>
void InMemoryDataFeed<T>::SetParseContent(bool parse_content) {
parse_content_ = parse_content;
}
template <typename T>
void InMemoryDataFeed<T>::SetParseInsId(bool parse_ins_id) {
parse_ins_id_ = parse_ins_id;
......@@ -301,7 +362,8 @@ void MultiSlotDataFeed::Init(
paddle::framework::MultiSlotDesc multi_slot_desc =
data_feed_desc.multi_slot_desc();
SetBatchSize(data_feed_desc.batch_size());
SetQueueSize(data_feed_desc.batch_size());
// temporarily set queue size = batch size * 100
SetQueueSize(data_feed_desc.batch_size() * 100);
size_t all_slot_num = multi_slot_desc.slots_size();
all_slots_.resize(all_slot_num);
all_slots_type_.resize(all_slot_num);
......@@ -610,15 +672,16 @@ void MultiSlotDataFeed::PutToFeedVec(
if (type[0] == 'f') { // float
const auto& feasign = ins_vec[i].GetFloatData();
float* tensor_ptr = feed_vec_[i]->mutable_data<float>(
{total_instance, 1}, platform::CPUPlace());
memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(float));
float* tensor_ptr =
feed_vec_[i]->mutable_data<float>({total_instance, 1}, this->place_);
CopyToFeedTensor(tensor_ptr, &feasign[0], total_instance * sizeof(float));
} else if (type[0] == 'u') { // uint64
// no uint64_t type in paddlepaddle
const auto& feasign = ins_vec[i].GetUint64Data();
int64_t* tensor_ptr = feed_vec_[i]->mutable_data<int64_t>(
{total_instance, 1}, platform::CPUPlace());
memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(int64_t));
{total_instance, 1}, this->place_);
CopyToFeedTensor(tensor_ptr, &feasign[0],
total_instance * sizeof(int64_t));
}
LoD data_lod{offset};
......@@ -709,6 +772,18 @@ bool MultiSlotInMemoryDataFeed::ParseOneInstanceFromPipe(Record* instance) {
pos += len + 1;
VLOG(3) << "ins_id " << instance->ins_id_;
}
if (parse_content_) {
int num = strtol(&str[pos], &endptr, 10);
CHECK(num == 1); // NOLINT
pos = endptr - str + 1;
size_t len = 0;
while (str[pos + len] != ' ') {
++len;
}
instance->content_ = std::string(str + pos, len);
pos += len + 1;
VLOG(3) << "content " << instance->content_;
}
for (size_t i = 0; i < use_slots_index_.size(); ++i) {
int idx = use_slots_index_[i];
int num = strtol(&str[pos], &endptr, 10);
......@@ -833,8 +908,14 @@ void MultiSlotInMemoryDataFeed::PutToFeedVec(
std::vector<std::vector<size_t>> offset(use_slots_.size(),
std::vector<size_t>{0});
std::vector<bool> visit(use_slots_.size(), false);
ins_content_vec_.clear();
ins_content_vec_.reserve(ins_vec.size());
ins_id_vec_.clear();
ins_id_vec_.reserve(ins_vec.size());
for (size_t i = 0; i < ins_vec.size(); ++i) {
auto& r = ins_vec[i];
ins_id_vec_.push_back(r.ins_id_);
ins_content_vec_.push_back(r.content_);
for (auto& item : r.float_feasigns_) {
batch_float_feasigns[item.slot()].push_back(item.sign().float_feasign_);
visit[item.slot()] = true;
......@@ -872,15 +953,15 @@ void MultiSlotInMemoryDataFeed::PutToFeedVec(
const auto& type = all_slots_type_[i];
if (type[0] == 'f') { // float
float* feasign = batch_float_feasigns[i].data();
float* tensor_ptr = feed_vec_[i]->mutable_data<float>(
{total_instance, 1}, platform::CPUPlace());
memcpy(tensor_ptr, feasign, total_instance * sizeof(float));
float* tensor_ptr =
feed_vec_[i]->mutable_data<float>({total_instance, 1}, this->place_);
CopyToFeedTensor(tensor_ptr, feasign, total_instance * sizeof(float));
} else if (type[0] == 'u') { // uint64
// no uint64_t type in paddlepaddle
uint64_t* feasign = batch_uint64_feasigns[i].data();
int64_t* tensor_ptr = feed_vec_[i]->mutable_data<int64_t>(
{total_instance, 1}, platform::CPUPlace());
memcpy(tensor_ptr, feasign, total_instance * sizeof(int64_t));
{total_instance, 1}, this->place_);
CopyToFeedTensor(tensor_ptr, feasign, total_instance * sizeof(int64_t));
}
auto& slot_offset = offset[i];
LoD data_lod{slot_offset};
......@@ -906,15 +987,16 @@ void PrivateInstantDataFeed<T>::PutToFeedVec() {
if (type[0] == 'f') { // float
const auto& feasign = ins_vec_[i].GetFloatData();
float* tensor_ptr = feed_vec_[i]->mutable_data<float>(
{total_instance, 1}, platform::CPUPlace());
memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(float));
float* tensor_ptr =
feed_vec_[i]->mutable_data<float>({total_instance, 1}, this->place_);
CopyToFeedTensor(tensor_ptr, &feasign[0], total_instance * sizeof(float));
} else if (type[0] == 'u') { // uint64
// no uint64_t type in paddlepaddle
const auto& feasign = ins_vec_[i].GetUint64Data();
int64_t* tensor_ptr = feed_vec_[i]->mutable_data<int64_t>(
{total_instance, 1}, platform::CPUPlace());
memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(int64_t));
{total_instance, 1}, this->place_);
CopyToFeedTensor(tensor_ptr, &feasign[0],
total_instance * sizeof(int64_t));
}
LoD data_lod{offset};
......
......@@ -26,6 +26,7 @@ limitations under the License. */
#include <sstream>
#include <string>
#include <thread> // NOLINT
#include <unordered_map>
#include <utility>
#include <vector>
......@@ -104,13 +105,25 @@ class DataFeed {
virtual void SetThreadNum(int thread_num) {}
// This function will do nothing at default
virtual void SetParseInsId(bool parse_ins_id) {}
virtual void SetParseContent(bool parse_content) {}
virtual void SetFileListMutex(std::mutex* mutex) {
mutex_for_pick_file_ = mutex;
}
virtual void SetFileListIndex(size_t* file_index) { file_idx_ = file_index; }
virtual const std::vector<std::string>& GetInsIdVec() const {
return ins_id_vec_;
}
virtual const std::vector<std::string>& GetInsContentVec() const {
return ins_content_vec_;
}
virtual int GetCurBatchSize() { return batch_size_; }
virtual void LoadIntoMemory() {
PADDLE_THROW("This function(LoadIntoMemory) is not implemented.");
}
virtual void SetPlace(const paddle::platform::Place& place) {
place_ = place;
}
virtual const paddle::platform::Place& GetPlace() const { return place_; }
protected:
// The following three functions are used to check if it is executed in this
......@@ -124,6 +137,7 @@ class DataFeed {
// This function is used to pick one file from the global filelist (thread
// safe).
virtual bool PickOneFile(std::string* filename);
virtual void CopyToFeedTensor(void* dst, const void* src, size_t size);
std::vector<std::string> filelist_;
size_t* file_idx_;
......@@ -158,6 +172,9 @@ class DataFeed {
bool finish_set_filelist_;
bool finish_start_;
std::string pipe_command_;
std::vector<std::string> ins_id_vec_;
std::vector<std::string> ins_content_vec_;
platform::Place place_;
};
// PrivateQueueDataFeed is the base virtual class for other DataFeeds.
......@@ -215,6 +232,7 @@ class InMemoryDataFeed : public DataFeed {
virtual void SetThreadId(int thread_id);
virtual void SetThreadNum(int thread_num);
virtual void SetParseInsId(bool parse_ins_id);
virtual void SetParseContent(bool parse_content);
virtual void LoadIntoMemory();
protected:
......@@ -225,6 +243,7 @@ class InMemoryDataFeed : public DataFeed {
int thread_id_;
int thread_num_;
bool parse_ins_id_;
bool parse_content_;
std::ifstream file_;
std::shared_ptr<FILE> fp_;
paddle::framework::ChannelObject<T>* input_channel_;
......@@ -419,6 +438,42 @@ struct Record {
std::vector<FeatureItem> uint64_feasigns_;
std::vector<FeatureItem> float_feasigns_;
std::string ins_id_;
std::string content_;
};
struct RecordCandidate {
std::string ins_id_;
std::unordered_multimap<uint16_t, FeatureKey> feas;
RecordCandidate& operator=(const Record& rec) {
feas.clear();
ins_id_ = rec.ins_id_;
for (auto& fea : rec.uint64_feasigns_) {
feas.insert({fea.slot(), fea.sign()});
}
return *this;
}
};
class RecordCandidateList {
public:
RecordCandidateList() = default;
RecordCandidateList(const RecordCandidateList&) = delete;
RecordCandidateList& operator=(const RecordCandidateList&) = delete;
void ReSize(size_t length);
void ReInit();
void AddAndGet(const Record& record, RecordCandidate* result);
private:
size_t _capacity = 0;
std::mutex _mutex;
bool _full = false;
size_t _cur_size = 0;
size_t _total_size = 0;
std::vector<RecordCandidate> _candidate_list;
};
template <class AR>
......
......@@ -18,7 +18,6 @@
#include "paddle/fluid/operators/math/math_function.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#include "paddle/fluid/platform/mkldnn_reuse.h"
#endif
......@@ -121,28 +120,35 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
const Tensor& in, Tensor* out) {
auto in_layout = kernel_type_for_var.data_layout_;
auto out_layout = expected_kernel_type.data_layout_;
auto place = expected_kernel_type.place_;
PADDLE_ENFORCE(
in_layout == DataLayout::kMKLDNN && out_layout != DataLayout::kMKLDNN,
"TransDataLayoutFromMKLDNN only supports transform from MKLDNN to "
"non-MKLDNN");
innerTransDataLayoutFromMKLDNN(in_layout, out_layout, in, out, place);
}
void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, DataLayout out_layout,
const Tensor& in, Tensor* out,
platform::Place place) {
#ifdef PADDLE_WITH_MKLDNN
PADDLE_ENFORCE(in.format() != memory::format::format_undef &&
in.format() != memory::format::any,
"Input tensor should have specified memory format");
PADDLE_ENFORCE_NE(in.format(), MKLDNNMemoryFormat::format_undef,
"Input tensor should have specified memory format");
PADDLE_ENFORCE_NE(in.format(), MKLDNNMemoryFormat::any,
"Input tensor should have specified memory format");
// Set default as NCHW in case not specified
out_layout =
out_layout == DataLayout::kAnyLayout ? DataLayout::kNCHW : out_layout;
auto& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = dynamic_cast<platform::MKLDNNDeviceContext*>(
pool.Get(expected_kernel_type.place_));
auto* dev_ctx = dynamic_cast<platform::MKLDNNDeviceContext*>(pool.Get(place));
auto& cpu_engine = dev_ctx->GetEngine();
std::vector<int> in_tz = paddle::framework::vectorize2int(in.dims());
std::vector<int> out_tz = in_tz;
auto in_tz = paddle::framework::vectorize<int>(in.dims());
auto out_tz = in_tz;
memory::data_type in_type = ToMKLDNNDataType(in.type());
PADDLE_ENFORCE(in_type != memory::data_type::data_undef,
......@@ -157,15 +163,15 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
if (in_format != out_format) {
void* in_data = GetDataFromTensor(in, in_type);
const std::string key = platform::ReorderMKLDNNHandler::GetHash(
in_tz, in_format, out_format, std::to_string(in_type));
const std::string key = platform::CreateKey(in_tz, in_format, out_format,
std::to_string(in_type));
platform::ReorderMKLDNNHandler handler(in_tz, in.type(), in_type, *dev_ctx,
cpu_engine, key);
auto reorder_src_memory_p = handler.AcquireSrcMemory(in_format, in_data);
auto reorder_dst_memory_p =
handler.AcquireDstMemory(out, out_format, expected_kernel_type.place_);
handler.AcquireDstMemory(out, out_format, place);
auto reorder_p =
handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p);
......@@ -177,7 +183,7 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
}
out->set_layout(out_layout);
// reset format since the out tensor will be feed to non-MKLDNN OPkernel
out->set_format(memory::format::format_undef);
out->set_format(MKLDNNMemoryFormat::format_undef);
#endif
}
......
......@@ -21,30 +21,33 @@
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/variable.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle {
namespace framework {
#ifdef PADDLE_WITH_MKLDNN
using MKLDNNFormat = mkldnn::memory::format;
using MKLDNNDataType = mkldnn::memory::data_type;
inline MKLDNNFormat ToMKLDNNFormat(const DataLayout& layout) {
inline MKLDNNMemoryFormat ToMKLDNNFormat(const DataLayout& layout) {
switch (layout) {
case DataLayout::kNHWC:
return MKLDNNFormat::nhwc;
return MKLDNNMemoryFormat::nhwc;
case DataLayout::kNCHW:
return MKLDNNFormat::nchw;
return MKLDNNMemoryFormat::nchw;
default:
PADDLE_THROW("Fail to convert layout %s to MKLDNN format",
DataLayoutToString(layout));
}
}
inline DataLayout ToPaddleLayout(const MKLDNNFormat& format) {
inline DataLayout ToPaddleLayout(const MKLDNNMemoryFormat& format) {
switch (format) {
case MKLDNNFormat::nhwc:
case MKLDNNMemoryFormat::nhwc:
return DataLayout::kNHWC;
case MKLDNNFormat::nchw:
case MKLDNNMemoryFormat::nchw:
return DataLayout::kNCHW;
default:
PADDLE_THROW("Fail to convert MKLDNN format to paddle layout");
......@@ -69,6 +72,10 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
const OpKernelType& expected_kernel_type,
const Tensor& in, Tensor* out);
void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, DataLayout out_layout,
const Tensor& in, Tensor* out,
platform::Place place);
std::vector<int> GetAxis(const DataLayout& from, const DataLayout& to);
void TransDataLayout(const OpKernelType& kernel_type_for_var,
......
......@@ -42,12 +42,16 @@ DatasetImpl<T>::DatasetImpl() {
channel_num_ = 1;
file_idx_ = 0;
cur_channel_ = 0;
fleet_send_batch_size_ = 80000;
fleet_send_sleep_seconds_ = 2;
fleet_send_batch_size_ = 1024;
fleet_send_sleep_seconds_ = 0;
merge_by_insid_ = false;
erase_duplicate_feas_ = true;
keep_unmerged_ins_ = true;
min_merge_size_ = 2;
parse_ins_id_ = false;
parse_content_ = false;
preload_thread_num_ = 0;
global_index_ = 0;
}
// set filelist, file_idx_ will reset to zero.
......@@ -103,17 +107,36 @@ void DatasetImpl<T>::SetChannelNum(int channel_num) {
channel_num_ = channel_num;
}
template <typename T>
void DatasetImpl<T>::SetParseInsId(bool parse_ins_id) {
parse_ins_id_ = parse_ins_id;
}
template <typename T>
void DatasetImpl<T>::SetParseContent(bool parse_content) {
parse_content_ = parse_content;
}
template <typename T>
void DatasetImpl<T>::SetMergeByInsId(
const std::vector<std::string>& merge_slot_list, bool erase_duplicate_feas,
int min_merge_size, bool keep_unmerged_ins) {
merge_by_insid_ = true;
parse_ins_id_ = true;
merge_slots_list_ = merge_slot_list;
erase_duplicate_feas_ = erase_duplicate_feas;
min_merge_size_ = min_merge_size;
keep_unmerged_ins_ = keep_unmerged_ins;
}
template <typename T>
void DatasetImpl<T>::SetFeaEval(bool fea_eval, int record_candidate_size) {
slots_shuffle_fea_eval_ = fea_eval;
slots_shuffle_rclist_.ReSize(record_candidate_size);
VLOG(3) << "SetFeaEval fea eval mode: " << fea_eval
<< " with record candidate size: " << record_candidate_size;
}
template <typename T>
std::vector<paddle::framework::DataFeed*> DatasetImpl<T>::GetReaders() {
std::vector<paddle::framework::DataFeed*> ret;
......@@ -182,10 +205,21 @@ void DatasetImpl<T>::LoadIntoMemory() {
template <typename T>
void DatasetImpl<T>::PreLoadIntoMemory() {
VLOG(3) << "DatasetImpl<T>::PreLoadIntoMemory() begin";
preload_threads_.clear();
for (int64_t i = 0; i < thread_num_; ++i) {
preload_threads_.push_back(std::thread(
&paddle::framework::DataFeed::LoadIntoMemory, readers_[i].get()));
if (preload_thread_num_ != 0) {
CHECK(preload_thread_num_ == preload_readers_.size());
preload_threads_.clear();
for (int64_t i = 0; i < preload_thread_num_; ++i) {
preload_threads_.push_back(
std::thread(&paddle::framework::DataFeed::LoadIntoMemory,
preload_readers_[i].get()));
}
} else {
CHECK(thread_num_ == readers_.size());
preload_threads_.clear();
for (int64_t i = 0; i < thread_num_; ++i) {
preload_threads_.push_back(std::thread(
&paddle::framework::DataFeed::LoadIntoMemory, readers_[i].get()));
}
}
VLOG(3) << "DatasetImpl<T>::PreLoadIntoMemory() end";
}
......@@ -258,7 +292,7 @@ void DatasetImpl<T>::LocalShuffle() {
}
template <typename T>
void DatasetImpl<T>::GlobalShuffle() {
void DatasetImpl<T>::GlobalShuffle(int thread_num) {
VLOG(3) << "DatasetImpl<T>::GlobalShuffle() begin";
platform::Timer timeline;
timeline.Start();
......@@ -325,13 +359,21 @@ void DatasetImpl<T>::GlobalShuffle() {
ars.shrink_to_fit();
data.clear();
data.shrink_to_fit();
sleep(this->fleet_send_sleep_seconds_);
// currently the bottleneck is that the server cannot handle large batches
// in time, so we remove this sleep, set fleet_send_batch_size to 1024,
// and set the server thread number to 24.
if (fleet_send_sleep_seconds_ != 0) {
sleep(this->fleet_send_sleep_seconds_);
}
}
};
VLOG(3) << "start global shuffle threads";
std::vector<std::thread> global_shuffle_threads;
for (int i = 0; i < thread_num_; ++i) {
if (thread_num == -1) {
thread_num = thread_num_;
}
VLOG(3) << "start global shuffle threads, num = " << thread_num;
for (int i = 0; i < thread_num; ++i) {
global_shuffle_threads.push_back(std::thread(global_shuffle_func));
}
for (std::thread& t : global_shuffle_threads) {
......@@ -345,6 +387,101 @@ void DatasetImpl<T>::GlobalShuffle() {
<< timeline.ElapsedSec() << " seconds";
}
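GlobalShuffle now accepts an optional sender-thread count, and per the comment above the per-batch sleep can be disabled once the parameter server can keep up. A caller-side sketch; the header path, the Dataset pointer and the concrete values are assumptions:

#include "paddle/fluid/framework/data_set.h"  // path assumed

// Shuffle an already-loaded dataset across trainers with an explicit
// sender-thread count and no sleep between send batches.
void ShuffleLoadedDataset(paddle::framework::Dataset* dataset) {
  dataset->SetFleetSendSleepSeconds(0);  // server-side throughput is assumed sufficient
  dataset->GlobalShuffle(24);            // 24 shuffle/sender threads
}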
template <typename T>
void DatasetImpl<T>::DynamicAdjustChannelNum(int channel_num) {
if (channel_num_ == channel_num) {
VLOG(3) << "DatasetImpl<T>::DynamicAdjustChannelNum channel_num_="
<< channel_num_ << ", channel_num_=channel_num, no need to adjust";
return;
}
VLOG(3) << "adjust channel num from " << channel_num_ << " to "
<< channel_num;
channel_num_ = channel_num;
std::vector<paddle::framework::Channel<T>>* origin_channels = nullptr;
std::vector<paddle::framework::Channel<T>>* other_channels = nullptr;
// find out which channel (output or consume) has data
int cur_channel = 0;
uint64_t output_channels_data_size = 0;
uint64_t consume_channels_data_size = 0;
CHECK(multi_output_channel_.size() == multi_consume_channel_.size());
for (int i = 0; i < multi_output_channel_.size(); ++i) {
output_channels_data_size += multi_output_channel_[i]->Size();
consume_channels_data_size += multi_consume_channel_[i]->Size();
}
if (output_channels_data_size != 0) {
CHECK(consume_channels_data_size == 0); // NOLINT
cur_channel = 0;
} else {
CHECK(output_channels_data_size == 0); // NOLINT
cur_channel = 1;
}
if (cur_channel == 0) {
origin_channels = &multi_output_channel_;
other_channels = &multi_consume_channel_;
} else {
origin_channels = &multi_consume_channel_;
other_channels = &multi_output_channel_;
}
CHECK(origin_channels != nullptr); // NOLINT
CHECK(other_channels != nullptr); // NOLINT
paddle::framework::Channel<T> total_data_channel =
paddle::framework::MakeChannel<T>();
std::vector<paddle::framework::Channel<T>> new_channels;
std::vector<paddle::framework::Channel<T>> new_other_channels;
std::vector<T> local_vec;
for (int i = 0; i < origin_channels->size(); ++i) {
local_vec.clear();
(*origin_channels)[i]->Close();
(*origin_channels)[i]->ReadAll(local_vec);
total_data_channel->Write(std::move(local_vec));
}
total_data_channel->Close();
total_data_channel->SetBlockSize(total_data_channel->Size() / channel_num +
1);
for (int i = 0; i < channel_num; ++i) {
local_vec.clear();
total_data_channel->Read(local_vec);
new_other_channels.push_back(paddle::framework::MakeChannel<T>());
new_channels.push_back(paddle::framework::MakeChannel<T>());
new_channels[i]->Write(std::move(local_vec));
}
total_data_channel->Clear();
origin_channels->clear();
other_channels->clear();
*origin_channels = new_channels;
*other_channels = new_other_channels;
new_channels.clear();
new_other_channels.clear();
std::vector<paddle::framework::Channel<T>>().swap(new_channels);
std::vector<paddle::framework::Channel<T>>().swap(new_other_channels);
local_vec.clear();
std::vector<T>().swap(local_vec);
VLOG(3) << "adjust channel num done";
}
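DynamicAdjustChannelNum drains whichever channel set currently holds data into one temporary channel and then deals it back out to the new number of channels, using a block size of total / channel_num + 1 so each channel receives a near-equal share. A simplified sketch of that redistribution step over plain vectors (not the Channel API):

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

// Redistribute all items into channel_num buckets of near-equal size,
// mirroring the block size total / channel_num + 1 used above.
std::vector<std::vector<int>> Redistribute(std::vector<std::vector<int>> channels,
                                           size_t channel_num) {
  std::vector<int> all;
  for (auto& ch : channels) {  // drain every old channel
    all.insert(all.end(), ch.begin(), ch.end());
    ch.clear();
  }
  size_t block = all.size() / channel_num + 1;
  std::vector<std::vector<int>> out(channel_num);
  for (size_t i = 0; i < channel_num; ++i) {  // deal blocks back out
    size_t begin = i * block;
    size_t end = std::min(begin + block, all.size());
    if (begin < all.size()) out[i].assign(all.begin() + begin, all.begin() + end);
  }
  return out;
}

int main() {
  std::vector<std::vector<int>> chans = {{1, 2, 3, 4, 5}, {6, 7}};
  auto out = Redistribute(chans, 3);
  for (size_t i = 0; i < out.size(); ++i)
    std::cout << "channel " << i << " holds " << out[i].size() << " items\n";
  return 0;
}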
template <typename T>
void DatasetImpl<T>::DynamicAdjustReadersNum(int thread_num) {
if (thread_num_ == thread_num) {
VLOG(3) << "DatasetImpl<T>::DynamicAdjustReadersNum thread_num_="
<< thread_num_ << ", thread_num_=thread_num, no need to adjust";
return;
}
VLOG(3) << "adjust readers num from " << thread_num_ << " to " << thread_num;
thread_num_ = thread_num;
std::vector<std::shared_ptr<paddle::framework::DataFeed>>().swap(readers_);
CreateReaders();
VLOG(3) << "adjust readers num done";
}
template <typename T>
void DatasetImpl<T>::SetFleetSendSleepSeconds(int seconds) {
fleet_send_sleep_seconds_ = seconds;
}
template <typename T>
void DatasetImpl<T>::CreateReaders() {
VLOG(3) << "Calling CreateReaders()";
......@@ -352,8 +489,6 @@ void DatasetImpl<T>::CreateReaders() {
VLOG(3) << "Filelist size in Dataset: " << filelist_.size();
VLOG(3) << "channel num in Dataset: " << channel_num_;
CHECK(thread_num_ > 0) << "thread num should > 0";
CHECK(thread_num_ <= filelist_.size())
<< "thread num should <= filelist size";
CHECK(channel_num_ > 0) << "channel num should > 0";
CHECK(channel_num_ <= thread_num_) << "channel num should <= thread num";
VLOG(3) << "readers size: " << readers_.size();
......@@ -372,7 +507,8 @@ void DatasetImpl<T>::CreateReaders() {
readers_[i]->SetFileListMutex(&mutex_for_pick_file_);
readers_[i]->SetFileListIndex(&file_idx_);
readers_[i]->SetFileList(filelist_);
readers_[i]->SetParseInsId(merge_by_insid_);
readers_[i]->SetParseInsId(parse_ins_id_);
readers_[i]->SetParseContent(parse_content_);
if (input_channel_ != nullptr) {
readers_[i]->SetInputChannel(input_channel_.get());
}
......@@ -401,6 +537,47 @@ void DatasetImpl<T>::DestroyReaders() {
cur_channel_ = 1 - cur_channel_;
}
template <typename T>
void DatasetImpl<T>::SetPreLoadThreadNum(int thread_num) {
preload_thread_num_ = thread_num;
}
template <typename T>
void DatasetImpl<T>::CreatePreLoadReaders() {
VLOG(3) << "Begin CreatePreLoadReaders";
if (preload_thread_num_ == 0) {
preload_thread_num_ = thread_num_;
}
CHECK(preload_thread_num_ > 0) << "thread num should > 0";
CHECK(input_channel_ != nullptr);
preload_readers_.clear();
for (int i = 0; i < preload_thread_num_; ++i) {
preload_readers_.push_back(
DataFeedFactory::CreateDataFeed(data_feed_desc_.name()));
preload_readers_[i]->Init(data_feed_desc_);
preload_readers_[i]->SetThreadId(i);
preload_readers_[i]->SetThreadNum(preload_thread_num_);
preload_readers_[i]->SetFileListMutex(&mutex_for_pick_file_);
preload_readers_[i]->SetFileListIndex(&file_idx_);
preload_readers_[i]->SetFileList(filelist_);
preload_readers_[i]->SetParseInsId(parse_ins_id_);
preload_readers_[i]->SetInputChannel(input_channel_.get());
preload_readers_[i]->SetOutputChannel(nullptr);
preload_readers_[i]->SetConsumeChannel(nullptr);
}
VLOG(3) << "End CreatePreLoadReaders";
}
template <typename T>
void DatasetImpl<T>::DestroyPreLoadReaders() {
VLOG(3) << "Begin DestroyPreLoadReaders";
preload_readers_.clear();
std::vector<std::shared_ptr<paddle::framework::DataFeed>>().swap(
preload_readers_);
file_idx_ = 0;
VLOG(3) << "End DestroyPreLoadReaders";
}
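CreatePreLoadReaders/DestroyPreLoadReaders give the dataset a separate pool of readers so that loading can run ahead of, or overlap with, training; together with SetPreLoadThreadNum, PreLoadIntoMemory and WaitPreLoadDone they form the preload flow. A caller-side sketch, assuming these methods are exposed on the Dataset interface and that the include path is correct:

#include "paddle/fluid/framework/data_set.h"  // path assumed

// Sketch of the asynchronous preload flow added in this change.
void PreloadThenTrain(paddle::framework::Dataset* dataset) {
  dataset->SetPreLoadThreadNum(8);  // dedicated loader threads
  dataset->CreatePreLoadReaders();
  dataset->PreLoadIntoMemory();     // starts the loader threads and returns
  // ... build the trainer / do other setup work here ...
  dataset->WaitPreLoadDone();       // join loader threads before training starts
  dataset->DestroyPreLoadReaders();
}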
template <typename T>
int64_t DatasetImpl<T>::GetMemoryDataSize() {
return input_channel_->Size();
......@@ -436,7 +613,16 @@ int DatasetImpl<T>::ReceiveFromClient(int msg_type, int client_id,
CHECK(ar.Cursor() == ar.Finish());
auto fleet_ptr = FleetWrapper::GetInstance();
int64_t index = fleet_ptr->LocalRandomEngine()() % channel_num_;
// do not use a random index because it doesn't perform well here;
// to make sure each channel gets data equally, we put data into the
// channels one by one.
// int64_t index = fleet_ptr->LocalRandomEngine()() % channel_num_;
int64_t index = 0;
{
std::unique_lock<std::mutex> lk(global_index_mutex_);
index = global_index_++;
}
index = index % channel_num_;
VLOG(3) << "ramdom index=" << index;
multi_output_channel_[index]->Write(std::move(data));
......@@ -648,5 +834,167 @@ void MultiSlotDataset::MergeByInsId() {
VLOG(3) << "MultiSlotDataset::MergeByInsId end";
}
void MultiSlotDataset::GetRandomData(const std::set<uint16_t>& slots_to_replace,
std::vector<Record>* result) {
int debug_erase_cnt = 0;
int debug_push_cnt = 0;
auto multi_slot_desc = data_feed_desc_.multi_slot_desc();
slots_shuffle_rclist_.ReInit();
for (const auto& rec : slots_shuffle_original_data_) {
RecordCandidate rand_rec;
Record new_rec = rec;
slots_shuffle_rclist_.AddAndGet(rec, &rand_rec);
for (auto it = new_rec.uint64_feasigns_.begin();
it != new_rec.uint64_feasigns_.end();) {
if (slots_to_replace.find(it->slot()) != slots_to_replace.end()) {
it = new_rec.uint64_feasigns_.erase(it);
debug_erase_cnt += 1;
} else {
++it;
}
}
for (auto slot : slots_to_replace) {
auto range = rand_rec.feas.equal_range(slot);
for (auto it = range.first; it != range.second; ++it) {
new_rec.uint64_feasigns_.push_back({it->second, it->first});
debug_push_cnt += 1;
}
}
result->push_back(std::move(new_rec));
}
VLOG(2) << "erase feasign num: " << debug_erase_cnt
<< " repush feasign num: " << debug_push_cnt;
}
// slots shuffle: write records with the selected slots shuffled back to input_channel_
void MultiSlotDataset::SlotsShuffle(
const std::set<std::string>& slots_to_replace) {
int out_channel_size = 0;
if (cur_channel_ == 0) {
for (size_t i = 0; i < multi_output_channel_.size(); ++i) {
out_channel_size += multi_output_channel_[i]->Size();
}
} else {
for (size_t i = 0; i < multi_consume_channel_.size(); ++i) {
out_channel_size += multi_consume_channel_[i]->Size();
}
}
VLOG(2) << "DatasetImpl<T>::SlotsShuffle() begin with input channel size: "
<< input_channel_->Size()
<< " output channel size: " << out_channel_size;
if (!slots_shuffle_fea_eval_) {
VLOG(3) << "DatasetImpl<T>::SlotsShuffle() end,"
"fea eval mode off, need to set on for slots shuffle";
return;
}
if ((!input_channel_ || input_channel_->Size() == 0) &&
slots_shuffle_original_data_.size() == 0 && out_channel_size == 0) {
VLOG(3) << "DatasetImpl<T>::SlotsShuffle() end, no data to slots shuffle";
return;
}
platform::Timer timeline;
timeline.Start();
auto multi_slot_desc = data_feed_desc_.multi_slot_desc();
std::set<uint16_t> index_slots;
for (size_t i = 0; i < multi_slot_desc.slots_size(); ++i) {
std::string cur_slot = multi_slot_desc.slots(i).name();
if (slots_to_replace.find(cur_slot) != slots_to_replace.end()) {
index_slots.insert(i);
}
}
if (slots_shuffle_original_data_.size() == 0) {
// before the first slots shuffle, instances could be in the
// input_channel, output_channel or consume_channel
if (input_channel_ && input_channel_->Size() != 0) {
slots_shuffle_original_data_.reserve(input_channel_->Size());
input_channel_->Close();
input_channel_->ReadAll(slots_shuffle_original_data_);
} else {
CHECK(out_channel_size > 0); // NOLINT
if (cur_channel_ == 0) {
for (size_t i = 0; i < multi_output_channel_.size(); ++i) {
std::vector<Record> vec_data;
multi_output_channel_[i]->Close();
multi_output_channel_[i]->ReadAll(vec_data);
slots_shuffle_original_data_.reserve(
slots_shuffle_original_data_.size() + vec_data.size());
slots_shuffle_original_data_.insert(
slots_shuffle_original_data_.end(),
std::make_move_iterator(vec_data.begin()),
std::make_move_iterator(vec_data.end()));
vec_data.clear();
vec_data.shrink_to_fit();
multi_output_channel_[i]->Clear();
}
} else {
for (size_t i = 0; i < multi_consume_channel_.size(); ++i) {
std::vector<Record> vec_data;
multi_consume_channel_[i]->Close();
multi_consume_channel_[i]->ReadAll(vec_data);
slots_shuffle_original_data_.reserve(
slots_shuffle_original_data_.size() + vec_data.size());
slots_shuffle_original_data_.insert(
slots_shuffle_original_data_.end(),
std::make_move_iterator(vec_data.begin()),
std::make_move_iterator(vec_data.end()));
vec_data.clear();
vec_data.shrink_to_fit();
multi_consume_channel_[i]->Clear();
}
}
}
} else {
// if already have original data for slots shuffle, clear channel
input_channel_->Clear();
if (cur_channel_ == 0) {
for (size_t i = 0; i < multi_output_channel_.size(); ++i) {
if (!multi_output_channel_[i]) {
continue;
}
multi_output_channel_[i]->Clear();
}
} else {
for (size_t i = 0; i < multi_consume_channel_.size(); ++i) {
if (!multi_consume_channel_[i]) {
continue;
}
multi_consume_channel_[i]->Clear();
}
}
}
int end_size = 0;
if (cur_channel_ == 0) {
for (size_t i = 0; i < multi_output_channel_.size(); ++i) {
if (!multi_output_channel_[i]) {
continue;
}
end_size += multi_output_channel_[i]->Size();
}
} else {
for (size_t i = 0; i < multi_consume_channel_.size(); ++i) {
if (!multi_consume_channel_[i]) {
continue;
}
end_size += multi_consume_channel_[i]->Size();
}
}
CHECK(input_channel_->Size() == 0)
<< "input channel should be empty before slots shuffle";
std::vector<Record> random_data;
random_data.clear();
// get slots shuffled random_data
GetRandomData(index_slots, &random_data);
input_channel_->Open();
input_channel_->Write(std::move(random_data));
random_data.clear();
random_data.shrink_to_fit();
input_channel_->Close();
timeline.Pause();
VLOG(2) << "DatasetImpl<T>::SlotsShuffle() end"
<< ", memory data size for slots shuffle=" << input_channel_->Size()
<< ", cost time=" << timeline.ElapsedSec() << " seconds";
}
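SlotsShuffle reuses the reservoir candidates collected in fea-eval mode to replace the feasigns of the selected slots and then writes the perturbed records back to the input channel, which is how per-slot feature importance can be evaluated. A caller-side sketch; the header path and the candidate size are assumptions:

#include <set>
#include <string>
#include "paddle/fluid/framework/data_set.h"  // path assumed

// Enable fea-eval mode, then shuffle one slot's features to measure how much
// the model depends on that slot.
void EvaluateSlot(paddle::framework::MultiSlotDataset* dataset,
                  const std::string& slot_name) {
  dataset->SetFeaEval(true, /*record_candidate_size=*/10000);
  std::set<std::string> slots_to_replace = {slot_name};
  dataset->SlotsShuffle(slots_to_replace);
  // ... rerun evaluation on the shuffled data and compare metrics ...
}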
} // end namespace framework
} // end namespace paddle
......@@ -17,6 +17,7 @@
#include <fstream>
#include <memory>
#include <mutex> // NOLINT
#include <set>
#include <string>
#include <thread> // NOLINT
#include <utility>
......@@ -57,10 +58,15 @@ class Dataset {
virtual void SetDataFeedDesc(const std::string& data_feed_desc_str) = 0;
// set channel num
virtual void SetChannelNum(int channel_num) = 0;
// set parse ins id
virtual void SetParseInsId(bool parse_ins_id) = 0;
virtual void SetParseContent(bool parse_content) = 0;
// set merge by ins id
virtual void SetMergeByInsId(const std::vector<std::string>& merge_slot_list,
bool erase_duplicate_feas, int min_merge_size,
bool keep_unmerged_ins) = 0;
// set fea eval mode
virtual void SetFeaEval(bool fea_eval, int record_candidate_size) = 0;
// get file list
virtual const std::vector<std::string>& GetFileList() = 0;
// get thread num
......@@ -93,7 +99,11 @@ class Dataset {
// local shuffle data
virtual void LocalShuffle() = 0;
// global shuffle data
virtual void GlobalShuffle() = 0;
virtual void GlobalShuffle(int thread_num = -1) = 0;
// for slots shuffle
virtual void SlotsShuffle(const std::set<std::string>& slots_to_replace) = 0;
virtual void GetRandomData(const std::set<uint16_t>& slots_to_replace,
std::vector<Record>* result) = 0;
// create readers
virtual void CreateReaders() = 0;
// destroy readers
......@@ -104,6 +114,17 @@ class Dataset {
virtual int64_t GetShuffleDataSize() = 0;
// merge by ins id
virtual void MergeByInsId() = 0;
// create preload readers
virtual void CreatePreLoadReaders() = 0;
// destroy preload readers after preload is done
virtual void DestroyPreLoadReaders() = 0;
// set preload thread num
virtual void SetPreLoadThreadNum(int thread_num) = 0;
// separate the train thread and the dataset thread
virtual void DynamicAdjustChannelNum(int channel_num) = 0;
virtual void DynamicAdjustReadersNum(int thread_num) = 0;
// set fleet send sleep seconds
virtual void SetFleetSendSleepSeconds(int seconds) = 0;
protected:
virtual int ReceiveFromClient(int msg_type, int client_id,
......@@ -126,13 +147,17 @@ class DatasetImpl : public Dataset {
const std::string& fs_ugi);
virtual void SetDataFeedDesc(const std::string& data_feed_desc_str);
virtual void SetChannelNum(int channel_num);
virtual void SetParseInsId(bool parse_ins_id);
virtual void SetParseContent(bool parse_content);
virtual void SetMergeByInsId(const std::vector<std::string>& merge_slot_list,
bool erase_duplicate_feas, int min_merge_size,
bool keep_unmerged_ins);
virtual void SetFeaEval(bool fea_eval, int record_candidate_size);
virtual const std::vector<std::string>& GetFileList() { return filelist_; }
virtual int GetThreadNum() { return thread_num_; }
virtual int GetTrainerNum() { return trainer_num_; }
virtual Channel<T> GetInputChannel() { return input_channel_; }
virtual int64_t GetFleetSendBatchSize() { return fleet_send_batch_size_; }
virtual std::pair<std::string, std::string> GetHdfsConfig() {
return std::make_pair(fs_name_, fs_ugi_);
......@@ -149,17 +174,27 @@ class DatasetImpl : public Dataset {
virtual void WaitPreLoadDone();
virtual void ReleaseMemory();
virtual void LocalShuffle();
virtual void GlobalShuffle();
virtual void GlobalShuffle(int thread_num = -1);
virtual void SlotsShuffle(const std::set<std::string>& slots_to_replace) {}
virtual void GetRandomData(const std::set<uint16_t>& slots_to_replace,
std::vector<Record>* result) {}
virtual void CreateReaders();
virtual void DestroyReaders();
virtual int64_t GetMemoryDataSize();
virtual int64_t GetShuffleDataSize();
virtual void MergeByInsId() {}
virtual void CreatePreLoadReaders();
virtual void DestroyPreLoadReaders();
virtual void SetPreLoadThreadNum(int thread_num);
virtual void DynamicAdjustChannelNum(int channel_num);
virtual void DynamicAdjustReadersNum(int thread_num);
virtual void SetFleetSendSleepSeconds(int seconds);
protected:
virtual int ReceiveFromClient(int msg_type, int client_id,
const std::string& msg);
std::vector<std::shared_ptr<paddle::framework::DataFeed>> readers_;
std::vector<std::shared_ptr<paddle::framework::DataFeed>> preload_readers_;
paddle::framework::Channel<T> input_channel_;
int channel_num_;
std::vector<paddle::framework::Channel<T>> multi_output_channel_;
......@@ -168,6 +203,8 @@ class DatasetImpl : public Dataset {
// and when we finish reading, we set cur_channel = 1 - cur_channel,
// so if cur_channel=0, all data are in output_channel, else consume_channel
int cur_channel_;
std::vector<T> slots_shuffle_original_data_;
RecordCandidateList slots_shuffle_rclist_;
int thread_num_;
paddle::framework::DataFeedDesc data_feed_desc_;
int trainer_num_;
......@@ -180,10 +217,16 @@ class DatasetImpl : public Dataset {
int64_t fleet_send_sleep_seconds_;
std::vector<std::thread> preload_threads_;
bool merge_by_insid_;
bool parse_ins_id_;
bool parse_content_;
bool erase_duplicate_feas_;
bool keep_unmerged_ins_;
int min_merge_size_;
std::vector<std::string> merge_slots_list_;
bool slots_shuffle_fea_eval_ = false;
int preload_thread_num_;
std::mutex global_index_mutex_;
int64_t global_index_ = 0;
};
// use std::vector<MultiSlotType> or Record as data type
......@@ -191,6 +234,9 @@ class MultiSlotDataset : public DatasetImpl<Record> {
public:
MultiSlotDataset() {}
virtual void MergeByInsId();
virtual void SlotsShuffle(const std::set<std::string>& slots_to_replace);
virtual void GetRandomData(const std::set<uint16_t>& slots_to_replace,
std::vector<Record>* result);
virtual ~MultiSlotDataset() {}
};
......
......@@ -48,22 +48,6 @@ bool DDim::operator==(const DDim& d) const {
bool DDim::operator!=(const DDim& d) const { return !(*this == d); }
std::vector<int64_t> vectorize(const DDim& ddim) {
std::vector<int64_t> result(DDim::kMaxRank);
dynamic_dim_assign(ddim.Get(), result.data(), ddim.size());
result.resize(ddim.size());
return result;
}
// NOTE: framework::vectorize converts to type int64_t
// which does not fit cudnn inputs.
std::vector<int> vectorize2int(const DDim& ddim) {
std::vector<int> result(DDim::kMaxRank);
dynamic_dim_assign(ddim.Get(), result.data(), ddim.size());
result.resize(ddim.size());
return result;
}
struct ProductVisitor {
template <int D>
inline int64_t operator()(const Dim<D>& dim) {
......
......@@ -170,8 +170,13 @@ DDim make_ddim(const std::vector<int>& dims);
*/
DDim make_ddim(std::initializer_list<int64_t> dims);
std::vector<int64_t> vectorize(const DDim& ddim);
std::vector<int> vectorize2int(const DDim& ddim);
template <typename T = int64_t>
std::vector<T> vectorize(const DDim& ddim) {
std::vector<T> result(DDim::kMaxRank);
dynamic_dim_assign(ddim.Get(), result.data(), ddim.size());
result.resize(ddim.size());
return result;
}
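The templated vectorize<T> above replaces the separate vectorize/vectorize2int pair; callers now choose the element type at the call site (int remains useful where cuDNN expects 32-bit dims). An illustrative call site; the include path and the dims argument are assumptions:

#include <cstdint>
#include <vector>
#include "paddle/fluid/framework/ddim.h"  // path assumed

void Example(const paddle::framework::DDim& dims) {
  std::vector<int64_t> shape64 = paddle::framework::vectorize(dims);   // default: int64_t
  std::vector<int> shape32 = paddle::framework::vectorize<int>(dims);  // e.g. for cuDNN calls
}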
int64_t product(const DDim& ddim);
......
......@@ -3,7 +3,10 @@ cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context
cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
cc_library(share_tensor_buffer_functor SRCS share_tensor_buffer_functor.cc DEPS framework_proto scope place operator op_registry)
cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
cc_library(share_tensor_buffer_op_handle SRCS share_tensor_buffer_op_handle.cc DEPS op_handle_base scope computation_op_handle share_tensor_buffer_functor)
cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place operator op_registry)
cc_library(fetch_barrier_op_handle SRCS fetch_barrier_op_handle.cc DEPS framework_proto scope place operator op_registry)
cc_library(multi_devices_helper SRCS multi_devices_helper.cc DEPS graph graph_helper)
......@@ -59,12 +62,7 @@ cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope d
cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper)
cc_library(share_tensor_buffer_op_handle SRCS share_tensor_buffer_op_handle.cc DEPS op_handle_base scope)
set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass inplace_op_pass buffer_shared_inplace_op_pass)
if (WITH_GPU)
list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass)
endif()
set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass buffer_shared_inplace_op_pass buffer_shared_cross_op_memory_reuse_pass)
cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS})
cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
......@@ -82,18 +80,27 @@ cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_ha
device_context broadcast_op_handle)
cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
device_context gather_op_handle)
cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_executor.cc DEPS ssa_graph_executor)
cc_library(scope_buffered_monitor SRCS scope_buffered_monitor.cc DEPS scope profiler selected_rows)
cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_executor.cc DEPS ssa_graph_executor scope_buffered_monitor)
#cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
# device_context reduce_op_handle )
cc_library(fast_threaded_ssa_graph_executor SRCS fast_threaded_ssa_graph_executor.cc
DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context)
cc_test(fused_broadcast_op_test SRCS fused_broadcast_op_handle_test.cc DEPS fused_broadcast_op_handle)
if(WITH_NGRAPH)
set(NGRAPH_BS_DEPS ngraph)
else()
set(NGRAPH_BS_DEPS)
endif()
cc_library(build_strategy SRCS build_strategy.cc DEPS
graph_viz_pass multi_devices_graph_pass
multi_devices_graph_print_pass multi_devices_graph_check_pass
fuse_elewise_add_act_pass multi_batch_merge_pass
fuse_relu_depthwise_conv_pass
memory_optimize_pass lock_free_optimize_pass
lock_free_optimize_pass
coalesce_grad_tensor_pass fuse_all_reduce_op_pass backward_optimizer_op_deps_pass
fuse_adam_op_pass fuse_sgd_op_pass fuse_momentum_op_pass record_skip_memory_opt_vars_pass)
fuse_adam_op_pass fuse_sgd_op_pass fuse_momentum_op_pass
${NGRAPH_BS_DEPS})
......@@ -20,12 +20,9 @@
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/profiler.h"
// asynchronous nccl allreduce or synchronous issue:
// https://github.com/PaddlePaddle/Paddle/issues/15049
DEFINE_bool(
sync_nccl_allreduce, true,
"If set true, will call `cudaStreamSynchronize(nccl_stream)`"
"after allreduce, this mode can get better performance in some scenarios.");
#ifdef PADDLE_WITH_CUDA
DECLARE_bool(sync_nccl_allreduce);
#endif
namespace paddle {
namespace framework {
......@@ -43,11 +40,124 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places)
: OpHandleBase(node), local_scopes_(local_scopes), places_(places) {}
: OpHandleBase(node), local_scopes_(local_scopes), places_(places) {
PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
}
#endif
void AllReduceOpHandle::RunImpl() {
platform::RecordEvent record_event(Name());
WaitInputVarGenerated();
std::vector<VarHandleBase *> inputs = this->Inputs();
std::vector<VarHandleBase *> outputs = this->Outputs();
auto in_var_handles = DynamicCast<VarHandle>(inputs);
auto out_var_handles = DynamicCast<VarHandle>(outputs);
AllReduceImpl(in_var_handles, out_var_handles);
}
void AllReduceOpHandle::AllReduceImpl(
const std::vector<VarHandle *> &in_var_handles,
const std::vector<VarHandle *> &out_var_handles) {
size_t num_places = places_.size();
PADDLE_ENFORCE_EQ(
in_var_handles.size(), num_places,
"The NoDummyInputSize should be equal to the number of places.");
PADDLE_ENFORCE_EQ(
in_var_handles.size(), out_var_handles.size(),
"The NoDummyInputSize and NoDummyOutputSize should be equal.");
PADDLE_ENFORCE_EQ(local_exec_scopes_.size(), num_places);
std::vector<const void *> lod_tensor_data;
std::vector<platform::Place> places;
lod_tensor_data.reserve(num_places);
places.reserve(num_places);
int64_t numel = -1;
bool is_gpu_place = false;
auto dtype = static_cast<framework::proto::VarType::Type>(0);
for (size_t i = 0; i < local_exec_scopes_.size(); ++i) {
auto &local_scope = local_exec_scopes_[i];
auto var = local_scope->FindVar(in_var_handles[i]->name());
PADDLE_ENFORCE_NOT_NULL(var, "%s is not found in scope.",
in_var_handles[i]->name());
auto &lod_tensor = var->Get<LoDTensor>();
if (i == 0) {
numel = static_cast<int64_t>(lod_tensor.numel());
dtype = lod_tensor.type();
is_gpu_place = platform::is_gpu_place(lod_tensor.place());
}
PADDLE_ENFORCE_EQ(numel, static_cast<int64_t>(lod_tensor.numel()));
PADDLE_ENFORCE_EQ(dtype, lod_tensor.type());
PADDLE_ENFORCE_EQ(is_gpu_place, platform::is_gpu_place(lod_tensor.place()));
lod_tensor_data.emplace_back(lod_tensor.data<void>());
places.emplace_back(lod_tensor.place());
VLOG(10) << "place:" << i << ", input_name:" << in_var_handles[i]->name()
<< ", out_name:" << out_var_handles[i]->name();
PADDLE_ENFORCE_EQ(in_var_handles[i]->name(), out_var_handles[i]->name(),
"The name of input and output should be equal.");
}
std::vector<std::string> grad_var_names;
grad_var_names.reserve(num_places);
for (auto &out_var : out_var_handles) {
grad_var_names.emplace_back(out_var->Name());
}
AllReduceFunc(lod_tensor_data, dtype, numel, places, grad_var_names);
}
void AllReduceOpHandle::AllReduceFunc(
std::vector<const void *> lod_tensor_data,
const framework::proto::VarType::Type &dtype, int64_t numel,
const std::vector<platform::Place> &places,
const std::vector<std::string> &out_var_names) {
if (is_gpu_place(places[0])) {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
PADDLE_ENFORCE_NOT_NULL(nccl_ctxs_, "nccl_ctxs should not be nullptr.");
ncclDataType_t nccl_dtype = platform::ToNCCLDataType(dtype);
std::vector<std::function<void()>> all_reduce_calls;
for (size_t i = 0; i < local_exec_scopes_.size(); ++i) {
auto &p = places[i];
void *buffer = const_cast<void *>(lod_tensor_data.at(i));
all_reduce_calls.emplace_back([=] {
NCCLAllReduce(p, buffer, buffer, numel, nccl_dtype, ncclSum);
});
}
NCCLAllReduceFunc(all_reduce_calls);
#else
PADDLE_THROW("Not compiled with CUDA.");
#endif
} else { // Special handle CPU only Operator's gradient. Like CRF
auto &trg = *local_exec_scopes_[0]
->FindVar(out_var_names[0])
->GetMutable<LoDTensor>();
// Reduce All Tensor to trg in CPU
ReduceBufferData func(lod_tensor_data, trg.data<void>(), numel);
VisitDataType(trg.type(), func);
for (size_t i = 1; i < local_exec_scopes_.size(); ++i) {
auto &scope = local_exec_scopes_[i];
auto &p = places[i];
auto *var = scope->FindVar(out_var_names[i]);
size_t size = numel * SizeOfType(trg.type());
RunAndRecordEvent(p, [&trg, var, p, size] {
auto dst_ptr = var->GetMutable<framework::LoDTensor>()->data<void>();
platform::CPUPlace cpu_place;
memory::Copy(cpu_place, dst_ptr, cpu_place, trg.data<void>(), size);
});
}
}
VLOG(10) << Name() << " size:" << numel * SizeOfType(dtype);
}
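The refactor splits RunImpl into AllReduceImpl, which gathers one raw buffer, dtype and numel per place and checks that they agree, and AllReduceFunc, which performs the actual reduction: NCCL on GPU, or an in-place CPU reduce followed by copies to the other places. A trimmed-down sketch of the CPU branch over plain float buffers (simplified, not the op handle):

#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

// Simplified CPU all-reduce: sum every place's buffer into the first one,
// then copy the result back to the remaining places.
void CpuAllReduceSum(std::vector<float*>& buffers, int64_t numel) {
  for (size_t i = 1; i < buffers.size(); ++i)
    for (int64_t j = 0; j < numel; ++j) buffers[0][j] += buffers[i][j];
  for (size_t i = 1; i < buffers.size(); ++i)
    std::memcpy(buffers[i], buffers[0], numel * sizeof(float));
}

int main() {
  std::vector<float> a = {1, 2}, b = {3, 4}, c = {5, 6};
  std::vector<float*> bufs = {a.data(), b.data(), c.data()};
  CpuAllReduceSum(bufs, 2);
  std::cout << a[0] << " " << a[1] << "\n";  // 9 11 on every place
  return 0;
}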
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
void AllReduceOpHandle::RunAllReduceFuncs(
void AllReduceOpHandle::NCCLAllReduceFunc(
const std::vector<std::function<void()>> &all_reduce_calls) {
this->RunAndRecordEvent([&] {
if (all_reduce_calls.size() == 1UL) {
......@@ -83,85 +193,6 @@ void AllReduceOpHandle::RunAllReduceFuncs(
}
#endif
void AllReduceOpHandle::RunImpl() {
platform::RecordEvent record_event(Name());
WaitInputVarGenerated();
auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
PADDLE_ENFORCE_EQ(
in_var_handles.size(), places_.size(),
"The NoDummyInputSize should be equal to the number of places.");
PADDLE_ENFORCE_EQ(
in_var_handles.size(), out_var_handles.size(),
"The NoDummyInputSize and NoDummyOutputSize should be equal.");
std::vector<const LoDTensor *> lod_tensors;
for (size_t i = 0; i < local_scopes_.size(); ++i) {
auto &local_scope = local_exec_scopes_[i];
auto &lod_tensor =
local_scope->FindVar(in_var_handles[i]->name())->Get<LoDTensor>();
lod_tensors.emplace_back(&lod_tensor);
VLOG(10) << "place:" << i << ", input_name:" << in_var_handles[i]->name()
<< ", out_name:" << out_var_handles[i]->name();
PADDLE_ENFORCE_EQ(in_var_handles[i]->name(), out_var_handles[i]->name(),
"The name of input and output should be equal.");
}
if (platform::is_gpu_place(lod_tensors[0]->place())) {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr.");
int dtype = -1;
size_t numel = 0;
std::vector<std::function<void()>> all_reduce_calls;
for (size_t i = 0; i < local_scopes_.size(); ++i) {
auto &p = places_[i];
auto &lod_tensor = *lod_tensors[i];
void *buffer = const_cast<void *>(lod_tensor.data<void>());
if (dtype == -1) {
dtype = platform::ToNCCLDataType(lod_tensor.type());
}
if (numel == 0) {
numel = static_cast<size_t>(lod_tensor.numel());
}
all_reduce_calls.emplace_back([=] {
NCCLAllReduce(p, buffer, buffer, numel,
static_cast<ncclDataType_t>(dtype), ncclSum);
});
}
VLOG(10) << "allreduce size:" << numel * SizeOfType(lod_tensors[0]->type());
RunAllReduceFuncs(all_reduce_calls);
#else
PADDLE_THROW("Not compiled with CUDA");
#endif
} else {  // Special handling for CPU-only operators' gradients, e.g. CRF
auto &trg = *this->local_exec_scopes_[0]
->FindVar(out_var_handles[0]->name())
->GetMutable<framework::LoDTensor>();
// Reduce all tensors into trg on CPU
ReduceLoDTensor func(lod_tensors, &trg);
VisitDataType(lod_tensors[0]->type(), func);
for (size_t i = 1; i < local_scopes_.size(); ++i) {
auto &scope = local_exec_scopes_[i];
auto &p = places_[i];
auto *var = scope->FindVar(out_var_handles[i]->name());
auto *dev_ctx = dev_ctxes_.at(p);
RunAndRecordEvent(p, [&trg, var, dev_ctx, p] {
auto &tensor_gpu = *var->GetMutable<framework::LoDTensor>();
auto &tensor_cpu = trg;
TensorCopy(tensor_cpu, p, *dev_ctx, &tensor_gpu);
});
}
}
}
std::string AllReduceOpHandle::Name() const { return "all_reduce"; }
} // namespace details
} // namespace framework
......
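The CPU-only branch of AllReduceOpHandle::RunImpl above (gradients of CPU-only operators such as CRF) reduces every place's tensor into the first scope's target and then copies the result back to the other places. Below is a minimal standalone sketch of that reduce-then-copy pattern in plain C++; the buffers and loop are illustrative only, not the Paddle types.

```cpp
#include <cstring>
#include <iostream>
#include <vector>

int main() {
  // One gradient buffer per place; the first one doubles as the target (trg).
  std::vector<std::vector<float>> per_place = {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}};
  std::vector<float> &trg = per_place[0];

  // Reduce: accumulate every other place's buffer into trg.
  for (size_t i = 1; i < per_place.size(); ++i)
    for (size_t j = 0; j < trg.size(); ++j) trg[j] += per_place[i][j];

  // Broadcast: copy the reduced result back to every other place's buffer,
  // mirroring the memory::Copy loop in the diff.
  for (size_t i = 1; i < per_place.size(); ++i)
    std::memcpy(per_place[i].data(), trg.data(), trg.size() * sizeof(float));

  for (float v : per_place[2]) std::cout << v << ' ';  // prints: 12 15 18
  std::cout << '\n';
}
```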
......@@ -61,9 +61,17 @@ class AllReduceOpHandle : public OpHandleBase {
#endif
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
void RunAllReduceFuncs(
void NCCLAllReduceFunc(
const std::vector<std::function<void()>> &all_reduce_calls);
#endif
void AllReduceImpl(const std::vector<VarHandle *> &in_var_handles,
const std::vector<VarHandle *> &out_var_handles);
void AllReduceFunc(std::vector<const void *> lod_tensor_data,
const framework::proto::VarType::Type &dtype,
int64_t numel, const std::vector<platform::Place> &places,
const std::vector<std::string> &out_var_handles);
};
} // namespace details
......
......@@ -38,8 +38,6 @@ void BroadcastOpHandle::RunImpl() {
VarHandle *in_var_handle = in_var_handles[0];
WaitInputVarGenerated();
BroadcastOneVar(*in_var_handle, out_var_handles, local_exec_scopes_);
}
......@@ -59,6 +57,7 @@ void BroadcastOpHandle::BroadcastOneVar(
InitOutputValue(in_var_handle, out_var_handles);
if (platform::is_cpu_place(in_tensor.place())) {
WaitInputVarGenerated();
for (auto *out_var_handle : out_var_handles) {
if (out_var_handle->IsTheSameVar(in_var_handle)) {
continue;
......@@ -109,6 +108,7 @@ void BroadcastOpHandle::BroadcastOneVar(
});
}
WaitInputVarGenerated();
this->RunAndRecordEvent([&] {
{
platform::NCCLGroupGuard guard;
......@@ -126,6 +126,9 @@ void BroadcastOpHandle::BroadcastOneVar(
&VariableVisitor::GetMutableTensor(out_var));
}
});
for (auto &p : places_) {
nccl_ctxs_->DevCtx(p)->Wait();
}
#else
PADDLE_THROW("CUDA is not enabled.");
#endif
......
......@@ -19,6 +19,7 @@
#include <unordered_set>
#include <utility>
#include <vector>
#include "boost/optional.hpp"
#include "paddle/fluid/framework/ir/pass_builder.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
......@@ -88,8 +89,8 @@ struct BuildStrategy {
bool fuse_elewise_add_act_ops_{false};
// Fuse_all_optimizer_ops and fuse_all_reduce_ops require that gradients
// not be of sparse types.
bool fuse_all_optimizer_ops_{false};
bool fuse_all_reduce_ops_{false};
boost::optional<bool> fuse_all_optimizer_ops_{boost::none};
boost::optional<bool> fuse_all_reduce_ops_{boost::none};
// fuse_relu_depthwise_conv can fuse the `relu ->
// depthwise_conv`
bool fuse_relu_depthwise_conv_{false};
......@@ -97,7 +98,7 @@ struct BuildStrategy {
// faster. Because fusing broadcast OP equals delaying the execution of all
// broadcast Ops, in this case, all nccl streams are used only for reduce
// operations for a period of time.
bool fuse_broadcast_ops_{false};
boost::optional<bool> fuse_broadcast_ops_{boost::none};
// replace batch_norm with sync_batch_norm.
bool sync_batch_norm_{false};
......@@ -108,19 +109,14 @@ struct BuildStrategy {
// FLAGS_use_mkldnn=false
std::unordered_set<std::string> mkldnn_enabled_op_types_;
// FIXME(liuwei1031) disable memory_optimzie and enable_inplace in 1.4
// to open them by default, we need to solve the fetch variable issue
// TODO(liuwei1031): memory_optimize depends on kStaleProgramOpDescs,
// it is not appropriate, because kStaleProgramOpDescs will be removed in the
// near future.
bool memory_optimize_{false};
// By default, memory_optimize is enabled if gc is disabled, and
// disabled if gc is enabled.
// Users can forcibly enable/disable memory_optimize by setting it to True/False.
boost::optional<bool> memory_optimize_{boost::none};
// Turn on inplace by default.
bool enable_inplace_{true};
// TODO(zjl): Remove this flag when MemoryOptimizePass is refactored
bool use_legacy_memory_optimize_strategy_{false};
// FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode,
// num_trainers is 1, so the current fields of build_strategy doesn't tell if
// it's distributed model.
......
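The BuildStrategy changes above turn several plain bool flags into boost::optional<bool>, so the framework can distinguish "the user never set this flag" from an explicit true/false and fill in its own default only in the unset case. A small sketch of the same tri-state idea, written with std::optional (C++17) and hypothetical names rather than the real BuildStrategy fields:

```cpp
#include <iostream>
#include <optional>

// Tri-state flag: unset (defer to a framework default), explicitly on, explicitly off.
struct Strategy {
  std::optional<bool> fuse_all_reduce_ops;  // boost::optional<bool> in the diff
};

// Honor an explicit user choice, otherwise fall back to the framework default.
bool ResolveFuseAllReduce(const Strategy &s, bool framework_default) {
  return s.fuse_all_reduce_ops.value_or(framework_default);
}

int main() {
  Strategy s;  // user did not touch the flag
  std::cout << ResolveFuseAllReduce(s, true) << '\n';  // 1: default applies
  s.fuse_all_reduce_ops = false;                       // user explicitly disabled it
  std::cout << ResolveFuseAllReduce(s, true) << '\n';  // 0: the user's choice wins
}
```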
......@@ -96,7 +96,8 @@ void EagerDeletionOpHandle::RunImpl() {
std::deque<std::shared_ptr<memory::Allocation>> garbages;
for (size_t i = 0; i < var_infos_.size(); ++i) {
auto *var_info = var_infos_[i];
if (var_info->IsSkipped() || !var_info->DecreaseRefCnt()) {
if (var_info->IsSkippedAllMemoryOptimization() ||
!var_info->DecreaseRefCnt()) {
continue;
}
......
......@@ -31,7 +31,7 @@ struct ExecutionStrategy {
// iterations the framework cleans up a local execution scope.
// In some models, the value of this parameter has a great
// influence on the performance (about 15%) of the program.
size_t num_iteration_per_drop_scope_{1};
size_t num_iteration_per_drop_scope_{100};
// At present, the kExperimental executor is the fastest in most models.
ExecutorType type_{kExperimental};
// This debug option.
......
......@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
#include <deque>
#include <memory>
#include <queue>
#include <string>
#include <unordered_map>
#include <unordered_set>
......@@ -191,13 +191,13 @@ void FastThreadedSSAGraphExecutor::RunOpAsync(
const std::shared_ptr<BlockingQueue<size_t>> &complete_q) {
++remaining_;
this->pool_.enqueue([=] {
std::queue<OpHandleBase *> op_queue;
op_queue.push(op);
std::deque<OpHandleBase *> op_queue;
op_queue.push_front(op);
size_t complete = 0;
while (!op_queue.empty()) {
OpHandleBase *op_to_run = op_queue.front();
op_queue.pop();
OpHandleBase *op_to_run = op_queue.back();
op_queue.pop_back();
if (!RunOp(op_to_run, complete_q, &complete)) {
return;
......@@ -213,7 +213,7 @@ void FastThreadedSSAGraphExecutor::RunOpAsync(
// NOTE(zjl): op with highest priority should run
// first without switching to another thread.
if (pending_op->GetPriority() == OpHandleBase::Priority::kHighest) {
op_queue.push(pending_op);
op_queue.push_back(pending_op);
} else {
if (op_to_run == nullptr) {
op_to_run = pending_op;
......@@ -224,7 +224,9 @@ void FastThreadedSSAGraphExecutor::RunOpAsync(
}
}
if (op_to_run != nullptr) op_queue.push(op_to_run);
if (op_to_run != nullptr) {
op_queue.push_front(op_to_run);
}
}
--remaining_;
complete_q->Push(complete);
......
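The executor change above swaps the per-thread work list from std::queue to std::deque: ops are popped from the back, newly ready highest-priority ops are pushed to the back so they run next on the same thread, and the deferred continuation op goes to the front. A toy sketch of why both insertion ends are needed (std::queue only exposes one):

```cpp
#include <deque>
#include <iostream>
#include <string>

int main() {
  std::deque<std::string> op_queue;
  op_queue.push_front("deferred_op");      // runs after the urgent work drains
  op_queue.push_back("high_priority_op");  // runs next on this thread

  while (!op_queue.empty()) {
    std::cout << "run " << op_queue.back() << '\n';  // back() is the pop end,
    op_queue.pop_back();                             // as in RunOpAsync
  }
  // Output: high_priority_op, then deferred_op.
}
```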
......@@ -61,12 +61,17 @@ void FetchOpHandle::RunImpl() {
var_handle->name());
auto &t = var->Get<framework::LoDTensor>();
if (platform::is_gpu_place(t.place())) {
if (t.IsInitialized() && t.numel() > 0) {
if (platform::is_gpu_place(t.place())) {
#ifdef PADDLE_WITH_CUDA
TensorCopy(t, cpu, &tensors_[i]);
TensorCopy(t, cpu, &tensors_[i]);
#endif
} else {
tensors_[i].ShareDataWith(t);
}
} else {
tensors_[i].ShareDataWith(t);
tensors_[i].clear();
tensors_[i].Resize({0});
}
tensors_[i].set_lod(t.lod());
}
......
......@@ -33,28 +33,18 @@ FusedAllReduceOpHandle::FusedAllReduceOpHandle(
ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places, const size_t num_of_all_reduce,
const platform::NCCLCommunicator *ctxs)
: NCCLOpHandleBase(node, places, ctxs),
local_scopes_(local_scopes),
num_of_all_reduce_(num_of_all_reduce) {
PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
}
: AllReduceOpHandle(node, local_scopes, places, ctxs),
num_of_all_reduce_(num_of_all_reduce) {}
#else
FusedAllReduceOpHandle::FusedAllReduceOpHandle(
ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places, const size_t num_of_all_reduce)
: OpHandleBase(node),
local_scopes_(local_scopes),
places_(places),
num_of_all_reduce_(num_of_all_reduce) {
PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
}
: AllReduceOpHandle(node, local_scopes, places),
num_of_all_reduce_(num_of_all_reduce) {}
#endif
void FusedAllReduceOpHandle::RunImpl() {
platform::RecordEvent record_event(Name());
VLOG(4) << this->DebugString();
WaitInputVarGenerated();
......@@ -71,6 +61,30 @@ void FusedAllReduceOpHandle::RunImpl() {
in_var_handles.size(), out_var_handles.size(),
"The NoDummyInputSize and NoDummyOutputSize should be equal.");
// Note: some gradient ops don't have a CUDA kernel, so their gradients
// live in CPUPlace; in this case, the all-reduce should not be fused.
if (InputIsInDifferentPlace(in_var_handles)) {
for (size_t j = 0; j < num_of_all_reduce_; ++j) {
std::vector<VarHandle *> dev_inputs;
std::vector<VarHandle *> dev_outputs;
dev_inputs.reserve(place_num);
dev_outputs.reserve(place_num);
for (size_t idx = 0; idx < place_num; ++idx) {
dev_inputs.emplace_back(in_var_handles.at(j * place_num + idx));
dev_outputs.emplace_back(out_var_handles.at(j * place_num + idx));
}
AllReduceImpl(dev_inputs, dev_outputs);
}
} else {
FusedAllReduceFunc(in_var_handles, out_var_handles);
}
}
void FusedAllReduceOpHandle::FusedAllReduceFunc(
const std::vector<VarHandle *> &in_var_handles,
const std::vector<VarHandle *> &out_var_handles) {
size_t place_num = places_.size();
GradientAndLoDTensor grads_tensor;
grads_tensor.resize(place_num);
......@@ -87,14 +101,11 @@ void FusedAllReduceOpHandle::RunImpl() {
static_cast<framework::proto::VarType::Type>(0);
GetDTypeAndNumel(g_tensor, &ele_dtype, &element_num);
if (numel == -1) {
if (scope_idx == 0) {
numel = element_num;
}
if (dtype == static_cast<framework::proto::VarType::Type>(0)) {
dtype = ele_dtype;
PADDLE_ENFORCE_NE(ele_dtype,
static_cast<framework::proto::VarType::Type>(0));
}
PADDLE_ENFORCE_EQ(ele_dtype, dtype);
// Check whether the address space is contiguous.
......@@ -134,66 +145,36 @@ void FusedAllReduceOpHandle::RunImpl() {
}
std::vector<const void *> lod_tensor_data;
lod_tensor_data.reserve(place_num);
for (size_t scope_idx = 0; scope_idx < place_num; ++scope_idx) {
auto data = grads_tensor.at(scope_idx).at(0).second->data<void>();
lod_tensor_data.emplace_back(data);
}
std::vector<std::string> grad_var_names;
grad_var_names.reserve(place_num);
for (auto &grad_t : grads_tensor) {
grad_var_names.emplace_back(grad_t.at(0).first);
}
if (platform::is_gpu_place(places_[0])) {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr.");
int nccl_dtype = platform::ToNCCLDataType(dtype);
std::vector<std::function<void()>> all_reduce_calls;
for (size_t i = 0; i < local_scopes_.size(); ++i) {
auto &p = places_[i];
void *buffer = const_cast<void *>(lod_tensor_data.at(i));
all_reduce_calls.emplace_back([=] {
NCCLAllReduce(p, buffer, buffer, numel,
static_cast<ncclDataType_t>(nccl_dtype), ncclSum);
});
}
AllReduceFunc(lod_tensor_data, dtype, numel, this->places_, grad_var_names);
}
VLOG(10) << "fusedallreduce size:" << numel * SizeOfType(dtype);
this->RunAndRecordEvent([&] {
if (all_reduce_calls.size() == 1UL) {
// Do not use NCCLGroup when manage NCCL by per thread per device
all_reduce_calls[0]();
} else {
platform::NCCLGroupGuard guard;
for (auto &call : all_reduce_calls) {
call();
}
bool FusedAllReduceOpHandle::InputIsInDifferentPlace(
const std::vector<VarHandle *> &in_var_handles) const {
for (size_t scope_idx = 0; scope_idx < local_scopes_.size(); ++scope_idx) {
auto *local_scope = local_exec_scopes_[scope_idx];
size_t place_num = places_.size();
for (size_t j = 0; j < in_var_handles.size(); j += place_num) {
auto var_name = in_var_handles[j]->name();
auto var = local_scope->FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(var, "%s is not found in local scope.", var_name);
auto &lod_tensor = var->Get<LoDTensor>();
if (!is_same_place(lod_tensor.place(), places_.at(scope_idx))) {
return true;
}
});
#else
PADDLE_THROW("Not compiled with CUDA");
#endif
} else {
// Special handling for CPU-only operators' gradients, e.g. CRF
auto grad_name = grads_tensor.at(0).at(0).first;
auto &trg = *this->local_exec_scopes_[0]
->FindVar(grad_name)
->GetMutable<framework::LoDTensor>();
// Reduce All data to trg in CPU
ReduceBufferData func(lod_tensor_data, trg.data<void>(), numel);
VisitDataType(trg.type(), func);
for (size_t i = 1; i < local_exec_scopes_.size(); ++i) {
auto &scope = *local_exec_scopes_[i];
auto &p = places_[i];
auto *var = scope.FindVar(grad_name);
auto *dev_ctx = dev_ctxes_.at(p);
size_t size = numel * SizeOfType(trg.type());
RunAndRecordEvent(p, [&trg, var, dev_ctx, p, size] {
auto dst_ptr = var->GetMutable<framework::LoDTensor>()->data<void>();
platform::CPUPlace cpu_place;
memory::Copy(cpu_place, dst_ptr, cpu_place, trg.data<void>(), size);
});
}
}
return false;
}
void FusedAllReduceOpHandle::GetGradLoDTensor(
......@@ -202,12 +183,14 @@ void FusedAllReduceOpHandle::GetGradLoDTensor(
std::vector<std::pair<std::string, const LoDTensor *>> *grad_tensor) const {
auto *local_scope = local_exec_scopes_[scope_idx];
size_t place_num = places_.size();
for (size_t j = 0; j < in_var_handles.size(); j += place_num) {
auto var_name = in_var_handles[j]->name();
PADDLE_ENFORCE_EQ(var_name, out_var_handles[j]->name());
auto &lod_tensor = local_scope->FindVar(var_name)->Get<LoDTensor>();
PADDLE_ENFORCE_EQ(lod_tensor.place(), places_.at(scope_idx));
auto var = local_scope->FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(var, "%s is not found in local scope.", var_name);
auto &lod_tensor = var->Get<LoDTensor>();
PADDLE_ENFORCE_EQ(lod_tensor.place(), places_.at(scope_idx),
"%s(%d) is not in the right place.", var_name, scope_idx);
grad_tensor->emplace_back(std::make_pair(var_name, &lod_tensor));
}
}
......
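FusedAllReduceOpHandle::RunImpl above stores its handles as one flat list indexed by `j * place_num + idx` (gradient j on device idx); when some gradients live on CPU, it falls back to one AllReduceImpl call per gradient by slicing that flat list. A standalone sketch of just the grouping arithmetic, with made-up names and plain strings instead of VarHandle pointers:

```cpp
#include <iostream>
#include <string>
#include <vector>

int main() {
  const size_t place_num = 2, num_of_all_reduce = 3;

  // Flat layout: grad0@dev0, grad0@dev1, grad1@dev0, grad1@dev1, ...
  std::vector<std::string> in_var_handles;
  for (size_t j = 0; j < num_of_all_reduce; ++j)
    for (size_t idx = 0; idx < place_num; ++idx)
      in_var_handles.push_back("grad" + std::to_string(j) + "@dev" +
                               std::to_string(idx));

  // Fallback: carve out one per-device group per gradient, as in RunImpl.
  for (size_t j = 0; j < num_of_all_reduce; ++j) {
    std::cout << "all_reduce group " << j << ":";
    for (size_t idx = 0; idx < place_num; ++idx)
      std::cout << ' ' << in_var_handles.at(j * place_num + idx);
    std::cout << '\n';
  }
}
```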
......@@ -17,6 +17,7 @@
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/details/all_reduce_op_handle.h"
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
......@@ -30,14 +31,14 @@ namespace framework {
namespace details {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
struct FusedAllReduceOpHandle : public NCCLOpHandleBase {
struct FusedAllReduceOpHandle : public AllReduceOpHandle {
FusedAllReduceOpHandle(ir::Node *node,
const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
const size_t num_of_all_reduce,
const platform::NCCLCommunicator *ctxs);
#else
struct FusedAllReduceOpHandle : public OpHandleBase {
struct FusedAllReduceOpHandle : public AllReduceOpHandle {
FusedAllReduceOpHandle(ir::Node *node,
const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
......@@ -45,22 +46,10 @@ struct FusedAllReduceOpHandle : public OpHandleBase {
#endif
std::string Name() const override;
// Delaying and buffering nccl_all_reduce calls together can significantly
// increase performance. Disable this feature by returning false.
bool IsMultiDeviceTransfer() override { return true; };
protected:
void RunImpl() override;
std::vector<Scope *> GetLocalScopes() override { return local_scopes_; }
private:
std::vector<Scope *> local_scopes_;
#if !(defined(PADDLE_WITH_CUDA) && !defined(_WIN32))
// NCCLOpHandleBase already have these attributes.
// Will polish it by class inheritance framework.
std::vector<platform::Place> places_;
#endif
size_t num_of_all_reduce_;
// Check the dtype of the input
......@@ -74,6 +63,12 @@ struct FusedAllReduceOpHandle : public OpHandleBase {
const std::vector<VarHandle *> &out_var_handles,
std::vector<std::pair<std::string, const LoDTensor *>>
*grad_tensor) const;
bool InputIsInDifferentPlace(
const std::vector<VarHandle *> &in_var_handles) const;
void FusedAllReduceFunc(const std::vector<VarHandle *> &in_var_handles,
const std::vector<VarHandle *> &out_var_handles);
};
} // namespace details
......
......@@ -42,6 +42,8 @@ typedef std::vector<std::unordered_map<std::string, std::vector<VarHandle *>>>
GraphVars;
constexpr char kGraphVars[] = "vars";
constexpr char kNRanks[] = "nranks";
constexpr char kPlaces[] = "places";
constexpr char kLocalScopes[] = "local_scopes";
constexpr char kNCCLCtxs[] = "nccl_ctxs";
......@@ -68,6 +70,9 @@ constexpr char kParamsAndSparseGrads[] = "params_and_sparse_grads";
typedef std::vector<ProgramDesc> ProgramDescs;
constexpr char kProgramDescs[] = "program_descs";
typedef std::unordered_set<std::string> PinnedVars;
constexpr char kPinnedVars[] = "pinned_vars";
typedef std::vector<std::vector<std::pair<std::string, std::string>>>
GroupParamsAndGrads;
constexpr char kGroupParamsAndDenseGrads[] = "group_params_dense_grads";
......
......@@ -108,6 +108,8 @@ class OpHandleBase {
ir::Node *Node() { return node_; }
const ir::Node *Node() const { return node_; }
void SetLocalExecScopes(
const std::unordered_map<Scope *, Scope *> &scope_map);
......
......@@ -78,44 +78,59 @@ struct ReduceBufferData {
}
};
inline void GatherLocalSelectedRows(
const std::vector<const SelectedRows *> &src_selecte_rows_,
const std::vector<platform::Place> &in_places,
const std::map<platform::Place, platform::DeviceContext *> &dev_ctxes,
const platform::Place &out_place, SelectedRows *dst_selecte_rows) {
PADDLE_ENFORCE(!src_selecte_rows_.empty());
std::vector<Tensor> in_tensors;
std::vector<int64_t> out_rows;
for (auto in_sr_ptr : src_selecte_rows_) {
auto &in_sr = *in_sr_ptr;
in_tensors.emplace_back(in_sr.value());
out_rows.insert(out_rows.end(), in_sr.rows().begin(), in_sr.rows().end());
struct GatherLocalSelectedRowsFunctor {
GatherLocalSelectedRowsFunctor(
const std::vector<const SelectedRows *> &src_selected_rows,
const std::vector<platform::Place> &in_places,
const std::map<platform::Place, platform::DeviceContext *> &dev_ctxes,
const platform::Place &out_place, SelectedRows *dst_selected_rows)
: dev_ctxes_(dev_ctxes),
in_places_(in_places),
out_place_(out_place),
dst_selected_rows_(dst_selected_rows) {
PADDLE_ENFORCE_EQ(src_selected_rows.empty(), false);
std::vector<int64_t> out_rows;
for (auto in_sr_ptr : src_selected_rows) {
auto &in_sr = *in_sr_ptr;
in_tensors_.emplace_back(in_sr.value());
out_rows.insert(out_rows.end(), in_sr.rows().begin(), in_sr.rows().end());
}
auto &pre_in = src_selected_rows[0];
auto &dst_tensor = *dst_selected_rows_;
dst_tensor.set_height(pre_in->height());
dst_tensor.set_rows(out_rows);
size_t rows = out_rows.size();
DDim out_dim = pre_in->GetCompleteDims();
out_dim[0] = static_cast<int64_t>(rows);
dst_tensor.mutable_value()->Resize(out_dim);
dst_tensor.mutable_value()->mutable_data(out_place, pre_in->value().type());
}
auto &pre_in = src_selecte_rows_[0];
auto &dst_tensor = *dst_selecte_rows;
dst_tensor.set_height(pre_in->height());
dst_tensor.set_rows(out_rows);
size_t rows = out_rows.size();
DDim out_dim = pre_in->GetCompleteDims();
out_dim[0] = static_cast<int64_t>(rows);
dst_tensor.mutable_value()->Resize(out_dim);
dst_tensor.mutable_value()->mutable_data(out_place, pre_in->value().type());
Tensor *out_tensor = dst_tensor.mutable_value();
// copy
int s = 0, e = 0;
for (size_t j = 0; j < in_tensors.size(); ++j) {
e += in_tensors[j].dims()[0];
auto sub_out = out_tensor->Slice(s, e);
paddle::framework::TensorCopy(in_tensors[j], out_place,
*(dev_ctxes.at(in_places[j])), &sub_out);
s = e;
void operator()() {
auto *out_tensor = dst_selected_rows_->mutable_value();
// copy
int s = 0, e = 0;
for (size_t j = 0; j < in_tensors_.size(); ++j) {
e += in_tensors_[j].dims()[0];
auto sub_out = out_tensor->Slice(s, e);
paddle::framework::TensorCopy(in_tensors_[j], out_place_,
*(dev_ctxes_.at(in_places_[j])), &sub_out);
s = e;
}
}
}
private:
const std::map<platform::Place, platform::DeviceContext *> &dev_ctxes_;
std::vector<platform::Place> in_places_;
std::vector<Tensor> in_tensors_;
platform::Place out_place_;
SelectedRows *dst_selected_rows_;
};
} // namespace details
} // namespace framework
......
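GatherLocalSelectedRows is refactored above from a free function into GatherLocalSelectedRowsFunctor: the constructor shapes and allocates the destination, operator() performs the copies, and the caller can now place WaitInputVarGenerated() between the two. A minimal sketch of that two-phase functor pattern with hypothetical types:

```cpp
#include <iostream>
#include <vector>

class GatherFunctor {
 public:
  explicit GatherFunctor(const std::vector<std::vector<int>> &inputs)
      : inputs_(inputs) {
    size_t total = 0;
    for (auto &in : inputs_) total += in.size();
    out_.reserve(total);  // phase 1: size the destination up front
  }
  void operator()() {  // phase 2: copy once inputs are known to be ready
    for (auto &in : inputs_) out_.insert(out_.end(), in.begin(), in.end());
  }
  const std::vector<int> &result() const { return out_; }

 private:
  std::vector<std::vector<int>> inputs_;
  std::vector<int> out_;
};

int main() {
  GatherFunctor functor({{1, 2}, {3}, {4, 5, 6}});
  // ... the real code calls WaitInputVarGenerated() here ...
  functor();
  std::cout << functor.result().size() << '\n';  // 6
}
```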
......@@ -66,8 +66,11 @@ void ReduceOpHandle::GatherSelectedRows(
auto gathered_var_mid = scope->Var(gathered_var_name);
auto gathered_select_rows =
gathered_var_mid->GetMutable<framework::SelectedRows>();
GatherLocalSelectedRows(src_selected_rows, in_places, dev_ctxes, out_place,
gathered_select_rows);
GatherLocalSelectedRowsFunctor functor(
src_selected_rows, in_places, dev_ctxes, out_place, gathered_select_rows);
WaitInputVarGenerated();
functor();
// FIXME(gongwb): remove this Wait.
Wait(dev_ctxes);
......@@ -167,9 +170,6 @@ void ReduceOpHandle::RunImpl() {
var_scopes.at(in_0_handle->scope_idx())->FindVar(in_0_handle->name());
PADDLE_ENFORCE_NOT_NULL(pre_in_var);
// Wait input done, this Wait is asynchronous operation
WaitInputVarGenerated();
// NOTE: The Places of all input tensor must be all on CPU or all on GPU.
std::vector<platform::Place> in_places; // used to get dev_ctx
for (auto *in_handle : in_var_handles) {
......@@ -209,9 +209,11 @@ void ReduceOpHandle::RunImpl() {
// TODO(gongwb): add cpu support
if (collective_context.endpoints_.size() <= 1 ||
is_cpu_place(in_places[0]) || is_cpu_place(t_out_p)) {
GatherLocalSelectedRows(in_selected_rows, in_places, dev_ctxes_,
t_out_p,
out_var->GetMutable<framework::SelectedRows>());
GatherLocalSelectedRowsFunctor functor(
in_selected_rows, in_places, dev_ctxes_, t_out_p,
out_var->GetMutable<framework::SelectedRows>());
WaitInputVarGenerated();
functor();
return;
}
......@@ -236,6 +238,7 @@ void ReduceOpHandle::RunImpl() {
GetInputValues<LoDTensor>(in_var_handles, var_scopes);
if (paddle::platform::is_cpu_place(lod_tensors[0]->place())) {
WaitInputVarGenerated();
this->RunAndRecordEvent([&] {
// FIXME(zcd): The order of summing is important,
// especially when the type of data is float or double.
......@@ -295,6 +298,7 @@ void ReduceOpHandle::RunImpl() {
});
}
WaitInputVarGenerated();
this->RunAndRecordEvent([&] {
platform::NCCLGroupGuard guard;
for (auto &call : all_reduce_calls) {
......
......@@ -38,13 +38,11 @@ struct ScaleLossGradFunctor {
float coeff_;
Tensor *out_;
platform::Place place_;
OpHandleBase *op_handle_;
proto::VarType::Type out_dtype_;
platform::DeviceContext *ctx_;
ScaleLossGradFunctor(float coeff, Tensor *out, platform::Place place,
OpHandleBase *op_handle, proto::VarType::Type dtype,
platform::DeviceContext *ctx)
proto::VarType::Type dtype, platform::DeviceContext *ctx)
: coeff_(coeff), out_(out), place_(place), out_dtype_(dtype), ctx_(ctx) {}
template <typename OutT>
......@@ -76,11 +74,11 @@ void ScaleLossGradOpHandle::RunImpl() {
tensor->Resize(make_ddim({1}));
#ifdef PADDLE_WITH_CUDA
ScaleLossGradFunctor func(coeff_, tensor, place_, this, out_dtype_,
ScaleLossGradFunctor func(coeff_, tensor, place_, out_dtype_,
this->dev_ctxes_.at(place_));
this->RunAndRecordEvent([&] { framework::VisitDataType(out_dtype_, func); });
#else
ScaleLossGradFunctor func(coeff_, tensor, place_, this, out_dtype_, nullptr);
ScaleLossGradFunctor func(coeff_, tensor, place_, out_dtype_, nullptr);
framework::VisitDataType(out_dtype_, func);
#endif
}
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/scope_buffered_monitor.h"
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/profiler.h"
DECLARE_double(local_exe_sub_scope_limit);
namespace paddle {
namespace framework {
namespace details {
static constexpr double kMB = 1.0 * 1024 * 1024;  // bytes per megabyte
static void GetTensors(Variable *var,
std::unordered_set<Tensor *> *tensor_set) {
if (var->IsType<LoDTensor>() && var->Get<LoDTensor>().IsInitialized()) {
tensor_set->insert(var->GetMutable<LoDTensor>());
} else if (var->IsType<SelectedRows>() &&
var->Get<SelectedRows>().value().IsInitialized()) {
tensor_set->insert(var->GetMutable<SelectedRows>()->mutable_value());
} else if (var->IsType<LoDTensorArray>()) {
auto *tensor_arr = var->GetMutable<LoDTensorArray>();
for (auto &t : *tensor_arr) {
if (t.IsInitialized()) {
tensor_set->insert(&t);
}
}
}
}
static void GetTensors(Scope *scope, std::unordered_set<Tensor *> *tensor_set) {
for (auto &var_name : scope->LocalVarNames()) {
GetTensors(scope->FindVar(var_name), tensor_set);
}
for (auto *kid : scope->kids()) {
GetTensors(kid, tensor_set);
}
}
static size_t GetTensorMemorySize(Scope *scope, bool clear_cpu_tensor) {
std::unordered_set<Tensor *> tensor_set;
GetTensors(scope, &tensor_set);
size_t memory_size = 0;
std::unordered_set<memory::Allocation *> allocation_set;
for (auto *tensor : tensor_set) {
if (clear_cpu_tensor && platform::is_cpu_place(tensor->place())) {
tensor->clear();
} else {
auto allocation = tensor->Holder().get();
if (!allocation_set.count(allocation)) {
memory_size += allocation->size();
allocation_set.insert(allocation);
}
}
}
return memory_size;
}
size_t GetScopeVarMemorySize(Scope *scope) {
return GetTensorMemorySize(scope, false /*clear_cpu_tensor*/);
}
ScopeBufferedMonitor::ScopeBufferedMonitor(
const std::vector<platform::Place> &places,
const std::vector<Scope *> &local_exec_scopes)
: places_(places), local_exec_scopes_(local_exec_scopes) {
pre_local_exec_scopes_.resize(local_exec_scopes_.size());
post_local_exec_scopes_.resize(local_exec_scopes_.size());
}
void ScopeBufferedMonitor::Apply(const std::function<void()> &callback,
bool has_fetch) {
std::unique_ptr<platform::RecordEvent> pre_local_exec_scopes_event(
new platform::RecordEvent(
"ScopeBufferedMonitor::pre_local_exec_scopes_process"));
for (size_t scope_id = 0; scope_id < local_exec_scopes_.size(); ++scope_id) {
pre_local_exec_scopes_.at(scope_id).clear();
auto scopes = local_exec_scopes_.at(scope_id)->kids();
VLOG(10) << "pre_local_exec_scopes[" << scope_id
<< "] sub-scope: " << scopes.size();
pre_local_exec_scopes_.at(scope_id).insert(scopes.begin(), scopes.end());
}
pre_local_exec_scopes_event.reset();
callback();
std::unique_ptr<platform::RecordEvent> post_local_exec_scopes_event(
new platform::RecordEvent(
"ScopeBufferedMonitor::post_local_exec_scopes_process"));
for (size_t scope_id = 0; scope_id < local_exec_scopes_.size(); ++scope_id) {
post_local_exec_scopes_.at(scope_id).clear();
auto scopes = local_exec_scopes_.at(scope_id)->kids();
VLOG(10) << "post_local_exec_scopes[" << scope_id
<< "] sub-scope: " << scopes.size();
post_local_exec_scopes_.at(scope_id).insert(scopes.begin(), scopes.end());
}
history_local_exec_scopes_.emplace_back();
auto &incr_local_exec_scopes = history_local_exec_scopes_.back();
incr_local_exec_scopes.resize(local_exec_scopes_.size());
for (size_t scope_id = 0; scope_id < local_exec_scopes_.size(); ++scope_id) {
for (auto &scope : post_local_exec_scopes_.at(scope_id)) {
if (!pre_local_exec_scopes_.at(scope_id).count(scope)) {
incr_local_exec_scopes.at(scope_id).insert(scope);
}
}
if (VLOG_IS_ON(10)) {
if (incr_local_exec_scopes.at(scope_id).size() &&
FLAGS_local_exe_sub_scope_limit > 0) {
VLOG(10)
<< "FLAGS_local_exe_sub_scope_limit is "
<< FLAGS_local_exe_sub_scope_limit
<< " MBytes now. If you don't need to limit the memory of local "
"execution scope, you should set "
"FLAGS_local_exe_sub_scope_limit=-1.";
}
std::stringstream out;
out << scope_id << " kids: ";
for (auto &scope : incr_local_exec_scopes.at(scope_id)) {
out << scope << ", ";
}
VLOG(10) << out.str();
}
}
size_t history_step = history_local_exec_scopes_.size();
if (has_fetch && history_step >= 2) {
ClearHistoryLocalExecScopes(history_step - 1);
}
// Clear the history scopes' CPU tensors and tally their GPU memory per place.
std::vector<size_t> gpu_memory_size_per_gpu(places_.size());
for (auto &scope_vec : history_local_exec_scopes_) {
for (size_t idx = 0; idx < scope_vec.size(); ++idx) {
for (auto &scope : scope_vec.at(idx)) {
gpu_memory_size_per_gpu.at(idx) +=
GetTensorMemorySize(scope, true /*clear_cpu_tensor*/);
}
}
}
if (VLOG_IS_ON(8)) {
for (size_t idx = 0; idx < gpu_memory_size_per_gpu.size(); ++idx) {
VLOG(8) << "history local exec scopes contains "
<< string::HumanReadableSize(gpu_memory_size_per_gpu.at(idx))
<< " in " << places_.at(idx);
}
}
if (FLAGS_local_exe_sub_scope_limit > 0) {
for (size_t idx = 0; idx < gpu_memory_size_per_gpu.size(); ++idx) {
if (gpu_memory_size_per_gpu.at(idx) / kMB >=
FLAGS_local_exe_sub_scope_limit) {
platform::DeviceContextPool::Instance().Get(places_.at(idx))->Wait();
local_exec_scopes_.at(idx)->DropKids();
}
for (auto &scope_vec : history_local_exec_scopes_) {
scope_vec.at(idx).clear();
}
}
}
}
void ScopeBufferedMonitor::ClearHistoryLocalExecScopes(size_t history_step) {
VLOG(10) << "delete pre_incr_local_exec_scopes.";
for (size_t i = 0; i < history_step; ++i) {
auto &pre_incr_local_exec_scopes = history_local_exec_scopes_.front();
for (size_t scope_idx = 0; scope_idx < pre_incr_local_exec_scopes.size();
++scope_idx) {
for (auto scope : pre_incr_local_exec_scopes[scope_idx]) {
local_exec_scopes_.at(scope_idx)->DeleteScope(scope);
}
}
history_local_exec_scopes_.pop_front();
}
}
void ScopeBufferedMonitor::ClearHistoryLocalExecScopes() {
history_local_exec_scopes_.clear();
}
} // namespace details
} // namespace framework
} // namespace paddle
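GetTensorMemorySize above sums allocation sizes per unique memory::Allocation pointer, because several tensors in a scope hierarchy can share one holder and must not be counted twice. A small standalone sketch of that accounting, using a hypothetical Allocation struct in place of the real allocator types:

```cpp
#include <iostream>
#include <memory>
#include <unordered_set>
#include <vector>

struct Allocation {
  size_t size;
};

int main() {
  auto shared = std::make_shared<Allocation>(Allocation{1024});
  auto owned = std::make_shared<Allocation>(Allocation{512});
  // Two tensors share the first allocation, a third owns its own.
  std::vector<std::shared_ptr<Allocation>> tensor_holders{shared, shared, owned};

  size_t memory_size = 0;
  std::unordered_set<Allocation *> allocation_set;
  for (auto &holder : tensor_holders) {
    Allocation *allocation = holder.get();
    if (!allocation_set.count(allocation)) {  // count each allocation once
      memory_size += allocation->size;
      allocation_set.insert(allocation);
    }
  }
  std::cout << memory_size << '\n';  // 1536, not 2560
}
```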
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <deque>
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/scope.h"
namespace paddle {
namespace framework {
namespace details {
class ScopeBufferedMonitor {
public:
ScopeBufferedMonitor(const std::vector<platform::Place> &places,
const std::vector<Scope *> &local_exec_scopes);
void Apply(const std::function<void()> &callback, bool has_fetch);
void ClearHistoryLocalExecScopes();
void ClearHistoryLocalExecScopes(size_t history_step);
private:
std::vector<platform::Place> places_;
std::vector<Scope *> local_exec_scopes_;
std::vector<std::unordered_set<Scope *>> pre_local_exec_scopes_;
std::vector<std::unordered_set<Scope *>> post_local_exec_scopes_;
std::deque<std::vector<std::unordered_set<Scope *>>>
history_local_exec_scopes_;
};
size_t GetScopeVarMemorySize(Scope *scope);
} // namespace details
} // namespace framework
} // namespace paddle
......@@ -18,6 +18,7 @@
#include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/profiler.h"
......@@ -103,16 +104,15 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
int dev_id = boost::get<platform::CUDAPlace>(place).device;
auto *nccl_ctxs = nccl_ctxs_->GetRunEnvNCCLCtx(run_order_, false);
auto &nccl_ctx = nccl_ctxs->at(dev_id);
auto *dev_ctx = nccl_ctxs->DevCtx(dev_id);
auto stream = nccl_ctx.stream();
auto comm = nccl_ctx.comm_;
auto &allocator =
platform::DeviceTemporaryAllocator::Instance().Get(place, stream);
int encode_size = 2 * k * sizeof(int);
// DGC uses ncclAllGather to collect the encoded data from all ranks,
// so the buffer needs room for nranks copies.
int buf_size = nranks_ * encode_size;
auto tmp_ious_data = allocator.Allocate(buf_size);
auto tmp_ious_data = memory::Alloc(*dev_ctx, buf_size);
void *gather_buff = reinterpret_cast<void *>(tmp_ious_data->ptr());
VLOG(10) << "in_numel:" << in_numel << ", out_numel:" << out_numel
......@@ -126,7 +126,7 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
});
}
RunAllReduceFuncs(all_reduce_calls);
NCCLAllReduceFunc(all_reduce_calls);
}
int SparseAllReduceOpHandle::GetKValue(const std::string &grad_name) {
......
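The DGC path above sizes its ncclAllGather buffer as nranks_ * encode_size, with encode_size = 2 * k * sizeof(int) per rank (presumably k indices plus k values of equal width); the gathered result then holds every rank's encoded chunk back to back. A quick arithmetic sketch with made-up k and nranks:

```cpp
#include <iostream>

int main() {
  const int k = 1000, nranks = 8;                                  // assumed values
  const int encode_size = 2 * k * static_cast<int>(sizeof(int));   // per rank
  const int buf_size = nranks * encode_size;                       // allgather buffer

  std::cout << "encode_size = " << encode_size << " bytes, "
            << "gather buffer = " << buf_size << " bytes\n";
  // With sizeof(int) == 4: encode_size = 8000, buffer = 64000 bytes.
}
```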
The remaining file diffs in this commit are collapsed and not shown.