提交 159c4073 编写于 作者: M minqiyang

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into imperative_resnet

test=develop
......@@ -33,9 +33,7 @@ if(WIN32)
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
endif(WIN32)
if(NOT CMAKE_CROSSCOMPILING)
find_package(CUDA QUIET)
endif(NOT CMAKE_CROSSCOMPILING)
find_package(CUDA QUIET)
find_package(Git REQUIRED)
find_package(Threads REQUIRED)
......@@ -49,7 +47,6 @@ option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FO
option(WITH_NGRAPH "Compile PaddlePaddle with nGraph support." OFF)
option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON)
option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF)
option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON)
option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON)
option(WITH_DOUBLE "Compile PaddlePaddle with double precision" OFF)
option(WITH_RDMA "Compile PaddlePaddle with RDMA support" OFF)
......@@ -60,11 +57,9 @@ option(WITH_DOC "Compile PaddlePaddle with documentation" OFF)
option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF)
option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF)
option(ON_TRAVIS "Exclude special unit test on Travis CI" OFF)
option(WITH_C_API "Compile PaddlePaddle with C-API(Prediction)" OFF)
option(WITH_FLUID_ONLY "Compile PaddlePaddle fluid only" OFF)
option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF)
option(GLIDE_INSTALL "Download and install go dependencies " ON)
option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF)
option(WITH_DISTRIBUTE "Compile with distributed support" OFF)
option(WITH_PSLIB "Compile with pslib support" OFF)
option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF)
......@@ -96,37 +91,6 @@ if(NOT CMAKE_BUILD_TYPE)
FORCE)
endif()
if(ANDROID OR IOS)
if(ANDROID)
if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "16")
message(FATAL_ERROR "Unsupport standalone toolchains with Android API level lower than 16")
endif()
endif()
set(WITH_GPU OFF CACHE STRING
"Disable GPU when cross-compiling for Android and iOS" FORCE)
set(WITH_AVX OFF CACHE STRING
"Disable AVX when cross-compiling for Android and iOS" FORCE)
set(WITH_PYTHON OFF CACHE STRING
"Disable PYTHON when cross-compiling for Android and iOS" FORCE)
set(WITH_RDMA OFF CACHE STRING
"Disable RDMA when cross-compiling for Android and iOS" FORCE)
set(WITH_MKL OFF CACHE STRING
"Disable MKL when cross-compiling for Android and iOS" FORCE)
set(WITH_NGRAPH OFF CACHE STRING
"Disable nGraph when cross-compiling for Android and iOS" FORCE)
set(WITH_GOLANG OFF CACHE STRING
"Disable golang when cross-compiling for Android and iOS" FORCE)
# Compile PaddlePaddle mobile inference library
if (NOT WITH_C_API)
set(WITH_C_API ON CACHE STRING
"Always compile the C_API when cross-compiling for Android and iOS" FORCE)
endif()
set(MOBILE_INFERENCE ON)
add_definitions(-DPADDLE_MOBILE_INFERENCE)
endif()
if (APPLE)
set(WITH_MKL OFF CACHE STRING
"Disable MKL for building on mac" FORCE)
......@@ -135,8 +99,6 @@ endif()
if (WIN32)
set(WITH_DISTRIBUTE OFF CACHE STRING
"Disable DISTRIBUTE when compiling for Windows" FORCE)
set(WITH_C_API OFF CACHE STRING
"Disable C_API when compiling for Windows" FORCE)
set(WITH_FLUID_ONLY ON CACHE STRING
"Enable FLUID_ONLY when compiling for Windows" FORCE)
endif()
......@@ -150,21 +112,7 @@ set(FLUID_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_install_dir" CACHE STRING
set(FLUID_INFERENCE_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_inference_install_dir" CACHE STRING
"A path setting fluid inference shared and static libraries")
if (WITH_C_API AND WITH_PYTHON)
message(WARNING "It is suggest not embedded a python interpreter in Paddle "
"when using C-API. It will give an unpredictable behavior when using a "
"different Python interpreter from compiling.")
endif()
if (WITH_C_API)
set(WITH_FLUID_ONLY OFF CACHE STRING "Disable install fluid when compile the C_API" FORCE)
endif()
if(MOBILE_INFERENCE)
set(THIRD_PARTY_BUILD_TYPE MinSizeRel)
else()
set(THIRD_PARTY_BUILD_TYPE Release)
endif()
set(THIRD_PARTY_BUILD_TYPE Release)
set(WITH_MKLML ${WITH_MKL})
if (NOT DEFINED WITH_MKLDNN)
......@@ -193,7 +141,6 @@ include(external/python) # download, build, install python
include(external/openblas) # download, build, install openblas
include(external/mkldnn) # download, build, install mkldnn
include(external/ngraph) # download, build, install nGraph
include(external/swig) # download, build, install swig
include(external/boost) # download boost
include(external/any) # download libn::any
include(external/eigen) # download eigen3
......@@ -312,11 +259,6 @@ if(WITH_MKLDNN)
list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB})
endif()
if(USE_NNPACK)
include(external/nnpack)
list(APPEND EXTERNAL_LIBS ${NNPACK_LIBS})
endif(USE_NNPACK)
set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
......@@ -334,9 +276,3 @@ add_subdirectory(paddle)
if(WITH_PYTHON)
add_subdirectory(python)
endif()
if(WITH_DOC)
find_package(Sphinx REQUIRED)
find_python_module(recommonmark REQUIRED)
add_subdirectory(doc)
endif()
......@@ -11,12 +11,10 @@ RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ub
# ENV variables
ARG WITH_GPU
ARG WITH_AVX
ARG WITH_DOC
ENV WOBOQ OFF
ENV WITH_GPU=${WITH_GPU:-ON}
ENV WITH_AVX=${WITH_AVX:-ON}
ENV WITH_DOC=${WITH_DOC:-OFF}
ENV HOME /root
# Add bash enhancements
......
FROM ubuntu:16.04
MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
ARG UBUNTU_MIRROR
RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
# ENV variables
ARG ANDROID_ABI
ARG ANDROID_API
ENV ANDROID_ABI=${ANDROID_ABI:-"armeabi-v7a"}
ENV ANDROID_API=${ANDROID_API:-21}
ENV HOME=/root \
ANDROID_NDK_HOME=/opt/android-ndk-linux \
ANDROID_TOOLCHAINS_DIR=/opt/toolchains
RUN apt-get update && \
apt-get install -y \
git python-dev python-pip python-numpy \
wget curl tar unzip gcc g++ locales clang-format-3.8 swig cmake && \
apt-get clean -y
# git credential to skip password typing
RUN git config --global credential.helper store
# Fix locales to en_US.UTF-8
RUN localedef -i en_US -f UTF-8 en_US.UTF-8
RUN pip install --upgrade pip==9.0.3 && \
pip install -U 'protobuf==3.1.0' && \
pip install -U wheel sphinx && \
pip install pre-commit
# Android NDK
RUN mkdir -p ${ANDROID_TOOLCHAINS_DIR} && \
mkdir -p /opt/android-ndk-tmp && \
cd /opt/android-ndk-tmp && \
wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip && \
unzip -q android-ndk-r14b-linux-x86_64.zip && \
mv android-ndk-r14b ${ANDROID_NDK_HOME} && \
rm -rf /opt/android-ndk-tmp
# - This module looks for Sphinx
# Find the Sphinx documentation generator
#
# This modules defines
# SPHINX_EXECUTABLE
# SPHINX_FOUND
find_program(SPHINX_EXECUTABLE
NAMES sphinx-build
PATHS
/usr/bin
/usr/local/bin
/opt/local/bin
DOC "Sphinx documentation generator"
)
if( NOT SPHINX_EXECUTABLE )
set(_Python_VERSIONS
2.7 2.6 2.5 2.4 2.3 2.2 2.1 2.0 1.6 1.5
)
foreach( _version ${_Python_VERSIONS} )
set( _sphinx_NAMES sphinx-build-${_version} )
find_program( SPHINX_EXECUTABLE
NAMES ${_sphinx_NAMES}
PATHS
/usr/bin
/usr/local/bin
/opt/loca/bin
DOC "Sphinx documentation generator"
)
endforeach()
endif()
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(Sphinx DEFAULT_MSG
SPHINX_EXECUTABLE
)
option( SPHINX_HTML_OUTPUT "Build a single HTML with the whole content." ON )
option( SPHINX_DIRHTML_OUTPUT "Build HTML pages, but with a single directory per document." OFF )
option( SPHINX_HTMLHELP_OUTPUT "Build HTML pages with additional information for building a documentation collection in htmlhelp." OFF )
option( SPHINX_QTHELP_OUTPUT "Build HTML pages with additional information for building a documentation collection in qthelp." OFF )
option( SPHINX_DEVHELP_OUTPUT "Build HTML pages with additional information for building a documentation collection in devhelp." OFF )
option( SPHINX_EPUB_OUTPUT "Build HTML pages with additional information for building a documentation collection in epub." OFF )
option( SPHINX_LATEX_OUTPUT "Build LaTeX sources that can be compiled to a PDF document using pdflatex." OFF )
option( SPHINX_MAN_OUTPUT "Build manual pages in groff format for UNIX systems." OFF )
option( SPHINX_TEXT_OUTPUT "Build plain text files." OFF )
mark_as_advanced(
SPHINX_EXECUTABLE
SPHINX_HTML_OUTPUT
SPHINX_DIRHTML_OUTPUT
SPHINX_HTMLHELP_OUTPUT
SPHINX_QTHELP_OUTPUT
SPHINX_DEVHELP_OUTPUT
SPHINX_EPUB_OUTPUT
SPHINX_LATEX_OUTPUT
SPHINX_MAN_OUTPUT
SPHINX_TEXT_OUTPUT
)
function( Sphinx_add_target target_name builder conf cache source destination )
add_custom_target( ${target_name} ALL
COMMAND ${SPHINX_EXECUTABLE} -b ${builder}
-d ${cache}
-c ${conf}
${source}
${destination}
COMMENT "Generating sphinx documentation: ${builder}"
COMMAND cd ${destination} && ln -sf ./index_*.html index.html
)
set_property(
DIRECTORY APPEND PROPERTY
ADDITIONAL_MAKE_CLEAN_FILES
${destination}
)
endfunction()
# Target dependencies can be optionally listed at the end.
function( Sphinx_add_targets target_base_name conf source base_destination )
set( _dependencies )
foreach( arg IN LISTS ARGN )
set( _dependencies ${_dependencies} ${arg} )
endforeach()
if( ${SPHINX_HTML_OUTPUT} )
Sphinx_add_target( ${target_base_name}_html html ${conf} ${source} ${base_destination}/html )
add_dependencies( ${target_base_name}_html ${_dependencies} )
endif()
if( ${SPHINX_DIRHTML_OUTPUT} )
Sphinx_add_target( ${target_base_name}_dirhtml dirhtml ${conf} ${source} ${base_destination}/dirhtml )
add_dependencies( ${target_base_name}_dirhtml ${_dependencies} )
endif()
if( ${SPHINX_QTHELP_OUTPUT} )
Sphinx_add_target( ${target_base_name}_qthelp qthelp ${conf} ${source} ${base_destination}/qthelp )
add_dependencies( ${target_base_name}_qthelp ${_dependencies} )
endif()
if( ${SPHINX_DEVHELP_OUTPUT} )
Sphinx_add_target( ${target_base_name}_devhelp devhelp ${conf} ${source} ${base_destination}/devhelp )
add_dependencies( ${target_base_name}_devhelp ${_dependencies} )
endif()
if( ${SPHINX_EPUB_OUTPUT} )
Sphinx_add_target( ${target_base_name}_epub epub ${conf} ${source} ${base_destination}/epub )
add_dependencies( ${target_base_name}_epub ${_dependencies} )
endif()
if( ${SPHINX_LATEX_OUTPUT} )
Sphinx_add_target( ${target_base_name}_latex latex ${conf} ${source} ${base_destination}/latex )
add_dependencies( ${target_base_name}_latex ${_dependencies} )
endif()
if( ${SPHINX_MAN_OUTPUT} )
Sphinx_add_target( ${target_base_name}_man man ${conf} ${source} ${base_destination}/man )
add_dependencies( ${target_base_name}_man ${_dependencies} )
endif()
if( ${SPHINX_TEXT_OUTPUT} )
Sphinx_add_target( ${target_base_name}_text text ${conf} ${source} ${base_destination}/text )
add_dependencies( ${target_base_name}_text ${_dependencies} )
endif()
if( ${BUILD_TESTING} )
sphinx_add_target( ${target_base_name}_linkcheck linkcheck ${conf} ${source} ${base_destination}/linkcheck )
add_dependencies( ${target_base_name}_linkcheck ${_dependencies} )
endif()
endfunction()
......@@ -64,24 +64,18 @@ endif()
## Then find the reference-cblas. www.netlib.org/blas/
set(REFERENCE_CBLAS_ROOT $ENV{REFERENCE_CBLAS_ROOT} CACHE PATH
"Folder contains reference-cblas")
if(NOT CMAKE_CROSSCOMPILING)
set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS
${REFERENCE_CBLAS_ROOT}/include
/usr/include
/usr/include/cblas
)
set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS
${REFERENCE_CBLAS_ROOT}/include
/usr/include
/usr/include/cblas
)
set(REFERENCE_CBLAS_LIB_SEARCH_PATHS
${REFERENCE_CBLAS_ROOT}/lib
/usr/lib
/usr/lib/blas/reference/
/usr/lib/reference/
)
else()
# Disable the finding of reference cblas under host's system path
set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/include)
set(REFERENCE_CBLAS_LIB_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/lib)
endif()
set(REFERENCE_CBLAS_LIB_SEARCH_PATHS
${REFERENCE_CBLAS_ROOT}/lib
/usr/lib
/usr/lib/blas/reference/
/usr/lib/reference/
)
if(WITH_SYSTEM_BLAS)
find_path(REFERENCE_CBLAS_INCLUDE_DIR NAMES cblas.h PATHS
......@@ -98,10 +92,3 @@ if(WITH_SYSTEM_BLAS)
message(STATUS "Found reference-cblas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
endif()
endif()
if(IOS_USE_VECLIB_FOR_BLAS AND VECLIB_FOUND)
set(CBLAS_FOUND ON)
set(CBLAS_PROVIDER vecLib)
set(CBLAS_INC_DIR ${VECLIB_INC_DIR})
add_definitions(-DPADDLE_USE_VECLIB)
endif()
......@@ -49,12 +49,10 @@ if(NOT WITH_PROFILER)
add_definitions(-DPADDLE_DISABLE_PROFILER)
endif(NOT WITH_PROFILER)
if(NOT CMAKE_CROSSCOMPILING)
if(WITH_AVX AND AVX_FOUND)
set(SIMD_FLAG ${AVX_FLAG})
elseif(SSE3_FOUND)
set(SIMD_FLAG ${SSE3_FLAG})
endif()
if(WITH_AVX AND AVX_FOUND)
set(SIMD_FLAG ${AVX_FLAG})
elseif(SSE3_FOUND)
set(SIMD_FLAG ${SSE3_FLAG})
endif()
if(WIN32)
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This is a toolchain file for cross-compiling for Android, and the
# configuration refers to the open-source resposity:
# https://github.com/taka-no-me/android-cmake
# Most of the variables are compatible with that used in
# https://developer.android.com/ndk/guides/cmake.html
# The supported variables are listed belows:
#
# ANDROID_STANDALONE_TOOLCHAIN
# ANDROID_TOOLCHAIN
# ANDROID_ABI
# ANDROID_NATIVE_API_LEVEL
# ANDROID_ARM_MODE
# ANDROID_ARM_NEON
#
# For CMake >= 3.7.0, all the settings will be delivered to CMake system
# variables to let CMake do the cross-compiling configurations itself.
# More detail of cross-compiling settings
# https://cmake.org/cmake/help/v3.7/manual/cmake-toolchains.7.html
IF(NOT ANDROID)
return()
ENDIF()
# check the exist of android standalone toolchain
IF(NOT DEFINED ANDROID_STANDALONE_TOOLCHAIN)
SET(ANDROID_STANDALONE_TOOLCHAIN $ENV{ANDROID_STANDALONE_TOOLCHAIN}
CACHE PATH "Folder holds the standalone toolchain of Android NDK")
ENDIF()
IF(NOT ANDROID_STANDALONE_TOOLCHAIN)
MESSAGE(WARNING "It is recommended to set ANDROID_STANDALONE_TOOLCHAIN to "
"use a standalone toolchain.\n"
"To cross-compile for Android, you need to:\n"
"1. Download an Android NDK from"
" https://developer.android.com/ndk/downloads/index.html\n"
"2. Setup a standalone toolchain"
"https://developer.android.google.cn/ndk/guides/standalone_toolchain.html?hl=zh-cn\n")
ENDIF()
IF(NOT DEFINED CMAKE_SYSTEM_VERSION AND ANDROID_NATIVE_API_LEVEL)
IF(ANDROID_NATIVE_API_LEVEL MATCHES "^android-[0-9]+$")
STRING(REPLACE "android-" "" CMAKE_SYSTEM_VERSION "${CMAKE_MATCH_0}")
ELSEIF(ANDROID_NATIVE_API_LEVEL MATCHES "^[0-9]+$")
SET(CMAKE_SYSTEM_VERSION ${ANDROID_NATIVE_API_LEVEL})
ENDIF()
ENDIF()
IF(NOT DEFINED ANDROID_TOOLCHAIN)
SET(ANDROID_TOOLCHAIN clang)
ENDIF()
IF(NOT DEFINED ANDROID_ABI)
SET(ANDROID_ABI "armeabi-v7a")
ENDIF()
IF(NOT DEFINED ANDROID_ARM_MODE)
SET(ANDROID_ARM_MODE ON)
ENDIF()
IF(ANDROID_ARM_MODE)
SET(ANDROID_ARM_MODE_NAME "arm")
ELSE(ANDROID_ARM_MODE)
SET(ANDROID_ARM_MODE_NAME "thumb")
ENDIF(ANDROID_ARM_MODE)
IF(NOT DEFINED ANDROID_ARM_NEON)
SET(ANDROID_ARM_NEON ON)
ENDIF()
IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
IF("${CMAKE_VERSION}" VERSION_LESS "3.1.0")
SET(CMAKE_SYSTEM_NAME "Linux")
ENDIF()
MESSAGE(WARNING "It is recommended to use CMake >= 3.7.0 (current version: "
"${CMAKE_VERSION}), when cross-compiling for Android.")
IF(ANDROID_STANDALONE_TOOLCHAIN)
# Use standalone toolchain
SET(CMAKE_SYSROOT "${ANDROID_STANDALONE_TOOLCHAIN}/sysroot")
IF(NOT CMAKE_SYSTEM_VERSION)
SET(ANDROID_STANDALONE_TOOLCHAIN_API "")
SET(ANDROID_API_LEVEL_H_REGEX "^[\t ]*#[\t ]*define[\t ]+__ANDROID_API__[\t ]+([0-9]+)")
FILE(STRINGS "${ANDROID_STANDALONE_TOOLCHAIN}/sysroot/usr/include/android/api-level.h"
ANDROID_API_LEVEL_H_CONTENT REGEX "${ANDROID_API_LEVEL_H_REGEX}")
IF(ANDROID_API_LEVEL_H_CONTENT MATCHES "${ANDROID_API_LEVEL_H_REGEX}")
SET(ANDROID_STANDALONE_TOOLCHAIN_API "${CMAKE_MATCH_1}")
ENDIF()
SET(CMAKE_SYSTEM_VERSION ${ANDROID_STANDALONE_TOOLCHAIN_API})
ENDIF()
# Toolchain
SET(ANDROID_TOOLCHAIN_ROOT ${ANDROID_STANDALONE_TOOLCHAIN})
ELSE(ANDROID_NDK)
# TODO: use android ndk
ENDIF()
IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
SET(ANDROID_TOOLCHAIN_NAME arm-linux-androideabi)
IF(ANDROID_ABI STREQUAL "armeabi")
SET(CMAKE_SYSTEM_PROCESSOR armv5te)
SET(ANDROID_CLANG_TRIPLE armv5te-none-linux-androideabi)
ELSEIF(ANDROID_ABI STREQUAL "armeabi-v7a")
SET(CMAKE_SYSTEM_PROCESSOR armv7-a)
SET(ANDROID_CLANG_TRIPLE armv7-none-linux-androideabi)
ENDIF()
ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a")
SET(ANDROID_TOOLCHAIN_NAME aarch64-linux-android)
SET(CMAKE_SYSTEM_PROCESSOR aarch64)
SET(ANDROID_CLANG_TRIPLE aarch64-none-linux-android)
ELSE()
MESSAGE(FATAL_ERROR "Invalid Android ABI: ${ANDROID_ABI}.")
ENDIF()
SET(ANDROID_TOOLCHAIN_PREFIX "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_NAME}-")
IF(ANDROID_TOOLCHAIN STREQUAL clang)
SET(ANDROID_C_COMPILER_NAME clang)
SET(ANDROID_CXX_COMPILER_NAME clang++)
SET(CMAKE_C_COMPILER_TARGET ${ANDROID_CLANG_TRIPLE})
SET(CMAKE_CXX_COMPILER_TARGET ${ANDROID_CLANG_TRIPLE})
ELSEIF(ANDROID_TOOLCHAIN STREQUAL gcc)
SET(ANDROID_C_COMPILER_NAME gcc)
SET(ANDROID_CXX_COMPILER_NAME g++)
ELSE()
MESSAGE(FATAL_ERROR "Invalid Android toolchain: ${ANDROID_TOOLCHAIN}")
ENDIF()
# C compiler
IF(NOT CMAKE_C_COMPILER)
SET(ANDROID_C_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}${ANDROID_C_COMPILER_NAME}")
ELSE()
GET_FILENAME_COMPONENT(ANDROID_C_COMPILER ${CMAKE_C_COMPILER} PROGRAM)
ENDIF()
IF(NOT EXISTS ${ANDROID_C_COMPILER})
MESSAGE(FATAL_ERROR "Cannot find C compiler: ${ANDROID_C_COMPILER}")
ENDIF()
# CXX compiler
IF(NOT CMAKE_CXX_COMPILER)
SET(ANDROID_CXX_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}${ANDROID_CXX_COMPILER_NAME}")
ELSE()
GET_FILENAME_COMPONENT(ANDROID_CXX_COMPILER ${CMAKE_CXX_COMPILER} PROGRAM)
ENDIF()
IF(NOT EXISTS ${ANDROID_CXX_COMPILER})
MESSAGE(FATAL_ERROR "Cannot find CXX compiler: ${ANDROID_CXX_COMPILER}")
ENDIF()
SET(CMAKE_C_COMPILER ${ANDROID_C_COMPILER} CACHE PATH "C compiler" FORCE)
SET(CMAKE_CXX_COMPILER ${ANDROID_CXX_COMPILER} CACHE PATH "CXX compiler" FORCE)
# Toolchain and ABI specific flags.
SET(ANDROID_COMPILER_FLAGS "-ffunction-sections -fdata-sections")
SET(ANDROID_LINKER_FLAGS "-Wl,--gc-sections")
IF(ANDROID_ABI STREQUAL "armeabi")
LIST(APPEND ANDROID_COMPILER_FLAGS
-march=armv5te
-mtune=xscale
-msoft-float)
ELSEIF(ANDROID_ABI STREQUAL "armeabi-v7a")
LIST(APPEND ANDROID_COMPILER_FLAGS
-march=armv7-a
-mfloat-abi=softfp)
IF(ANDROID_ARM_NEON)
LIST(APPEND ANDROID_COMPILER_FLAGS -mfpu=neon)
ELSE()
LIST(APPEND ANDROID_COMPILER_FLAGS -mfpu=vfpv3-d16)
ENDIF()
LIST(APPEND ANDROID_LINKER_FLAGS -Wl,--fix-cortex-a8)
ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a")
LIST(APPEND ANDROID_COMPILER_FLAGS -march=armv8-a)
ENDIF()
IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
IF(ANDROID_ARM_MODE)
LIST(APPEND ANDROID_COMPILER_FLAGS -marm)
ELSE()
LIST(APPEND ANDROID_COMPILER_FLAGS -mthumb)
ENDIF()
IF(ANDROID_TOOLCHAIN STREQUAL clang)
# Disable integrated-as for better compatibility.
LIST(APPEND ANDROID_COMPILER_FLAGS -fno-integrated-as)
ENDIF()
ENDIF()
IF(ANDROID_TOOLCHAIN STREQUAL clang)
# CMake automatically forwards all compiler flags to the linker,
# and clang doesn't like having -Wa flags being used for linking.
# To prevent CMake from doing this would require meddling with
# the CMAKE_<LANG>_COMPILE_OBJECT rules, which would get quite messy.
LIST(APPEND ANDROID_LINKER_FLAGS -Qunused-arguments)
ENDIF()
STRING(REPLACE ";" " " ANDROID_COMPILER_FLAGS "${ANDROID_COMPILER_FLAGS}")
STRING(REPLACE ";" " " ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS}")
SET(CMAKE_C_FLAGS "${ANDROID_COMPILER_FLAGS} ${CMAKE_C_FLAGS}"
CACHE STRING "C flags")
SET(CMAKE_CXX_FLAGS "${ANDROID_COMPILER_FLAGS} ${CMAKE_CXX_FLAGS}"
CACHE STRING "CXX flags")
SET(CMAKE_SHARED_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} ${CMAKE_SHARED_LINKER_FLAGS}"
CACHE STRING "shared linker flags")
SET(CMAKE_POSITION_INDEPENDENT_CODE TRUE)
SET(CMAKE_EXE_LINKER_FLAGS "-pie -fPIE ${ANDROID_LINKER_FLAGS} ${CMAKE_EXE_LINKER_FLAGS}"
CACHE STRING "executable linker flags")
MESSAGE(STATUS "Android: Targeting API '${CMAKE_SYSTEM_VERSION}' "
"with architecture '${ANDROID_ARM_MODE_NAME}', "
"ABI '${ANDROID_ABI}', and processor '${CMAKE_SYSTEM_PROCESSOR}'")
MESSAGE(STATUS "System CMAKE_C_FLAGS: " ${CMAKE_C_FLAGS})
MESSAGE(STATUS "System CMAKE_CXX_FLAGS: " ${CMAKE_CXX_FLAGS})
ELSE()
IF(ANDROID_STANDALONE_TOOLCHAIN)
SET(CMAKE_ANDROID_STANDALONE_TOOLCHAIN ${ANDROID_STANDALONE_TOOLCHAIN})
ENDIF()
SET(CMAKE_ANDROID_ARCH_ABI ${ANDROID_ABI})
IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
SET(CMAKE_ANDROID_ARM_MODE ${ANDROID_ARM_MODE})
IF(ANDROID_ABI STREQUAL "armeabi-v7a")
SET(CMAKE_ANDROID_ARM_NEON ${ANDROID_ARM_NEON})
ENDIF()
ENDIF()
ENDIF()
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# find host C compiler
IF(HOST_C_COMPILER)
SET(HOST_C_COMPILER_NAME ${HOST_C_COMPILER})
ELSEIF(NOT $ENV{CC} STREQUAL "")
SET(HOST_C_COMPILER_NAME $ENV{CC})
ELSE()
SET(HOST_C_COMPILER_NAME cc)
ENDIF()
GET_FILENAME_COMPONENT(HOST_C_COMPILER_PATH ${HOST_C_COMPILER_NAME} PROGRAM)
IF(NOT HOST_C_COMPILER_PATH OR NOT EXISTS ${HOST_C_COMPILER_PATH})
MESSAGE(FATAL_ERROR "Cannot find host C compiler, set host C compiler:\n"
"\tcmake .. -DHOST_C_COMPILER=...")
ENDIF()
# find host CXX compiler
IF(HOST_CXX_COMPILER)
SET(HOST_CXX_COMPILER_NAME ${HOST_CXX_COMPILER})
ELSEIF(NOT $ENV{CXX} STREQUAL "")
SET(HOST_CXX_COMPILER_NAME $ENV{CXX})
ELSE()
SET(HOST_CXX_COMPILER_NAME c++)
ENDIF()
GET_FILENAME_COMPONENT(HOST_CXX_COMPILER_PATH ${HOST_CXX_COMPILER_NAME} PROGRAM)
IF(NOT HOST_CXX_COMPILER_PATH OR NOT EXISTS ${HOST_CXX_COMPILER_PATH})
MESSAGE(FATAL_ERROR "Cannot find host CXX compiler, set host CXX compiler:\n"
"\tcmake .. -DHOST_CXX_COMPILER=...")
ENDIF()
SET(HOST_C_COMPILER ${HOST_C_COMPILER_PATH} CACHE PATH "Host C compiler")
SET(HOST_CXX_COMPILER ${HOST_CXX_COMPILER_PATH} CACHE PATH "Host CXX compiler")
MESSAGE(STATUS "Found host C compiler: " ${HOST_C_COMPILER})
MESSAGE(STATUS "Found host CXX compiler: " ${HOST_CXX_COMPILER})
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This is a toolchain file for cross-compiling for iOS, and the
# configuration largely refers to public toolchain file:
# https://raw.githubusercontent.com/leetal/ios-cmake/master/ios.toolchain.cmake
# and
# https://github.com/cristeab/ios-cmake
#
# Supports options:
# IOS_PLATFORM = OS (default) or SIMULATOR
# This decides if SDKS will be selected from the iPhoneOS.platform or iPhoneSimulator.platform folders
# OS - the default, used to build for iPhone and iPad physical devices, which have an arm arch.
# SIMULATOR - used to build for the Simulator platforms, which have an x86 arch.
# IOS_ARCH
# The archectures wanted to support, such "arm64", "armv7;arm64"
# IOS_DEPLOYMENT_TARGET
# The minimum iOS deployment version, such as "7.0"
# IOS_ENABLE_BITCODE = ON (default) or OFF
# IOS_USE_VECLIB_FOR_BLAS = OFF (default) or ON
# IOS_DEVELOPER_ROOT = automatic(default) or /path/to/platform/Developer folder
# By default this location is automatcially chosen based on the IOS_PLATFORM value above.
# If set manually, it will override the default location and force the user of a particular Developer Platform
# IOS_SDK_ROOT = automatic(default) or /path/to/platform/Developer/SDKs/SDK folder
# By default this location is automatcially chosen based on the IOS_DEVELOPER_ROOT value.
# In this case it will always be the most up-to-date SDK found in the IOS_DEVELOPER_ROOT path.
# If set manually, this will force the use of a specific SDK version
# Macros:
# set_xcode_property (TARGET XCODE_PROPERTY XCODE_VALUE)
# A convenience macro for setting xcode specific properties on targets
# example: set_xcode_property (myioslib IPHONEOS_DEPLOYMENT_TARGET "3.1")
# find_host_package (PROGRAM ARGS)
# A macro used to find executable programs on the host system, not within the iOS environment.
# Thanks to the android-cmake project for providing the command
if(NOT IOS)
return()
endif()
set(CMAKE_SYSTEM_NAME Darwin)
# Get the Xcode version being used.
execute_process(COMMAND xcodebuild -version
OUTPUT_VARIABLE XCODE_VERSION
RESULT_VARIABLE XCODE_VERSION_RESULT
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
if(NOT ${XCODE_VERSION_RESULT})
string(REGEX MATCH "Xcode [0-9\\.]+" XCODE_VERSION "${XCODE_VERSION}")
string(REGEX REPLACE "Xcode ([0-9\\.]+)" "\\1" XCODE_VERSION "${XCODE_VERSION}")
message(STATUS "Building with Xcode version: ${XCODE_VERSION}")
else()
message(FATAL_ERROR "Cannot execute xcodebuild, please check whether xcode is installed.")
endif()
# Required as of cmake 2.8.10
set(CMAKE_OSX_DEPLOYMENT_TARGET "" CACHE STRING "Force unset of the deployment target for iOS" FORCE)
# Setup iOS platform unless specified manually with IOS_PLATFORM
if(NOT DEFINED IOS_PLATFORM)
set(IOS_PLATFORM "OS")
endif()
set(IOS_PLATFORM ${IOS_PLATFORM} CACHE STRING "Type of iOS Platform")
# Set the architecture for iOS
if(NOT DEFINED IOS_ARCH)
if(IOS_PLATFORM STREQUAL "OS")
set(IOS_ARCH "armv7;armv7s;arm64")
elseif(IOS_PLATFORM STREQUAL "SIMULATOR")
set(IOS_ARCH "i386;x86_64")
endif()
endif()
set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE string "Build architecture for iOS")
# Specify minimum iOS deployment version
if(NOT DEFINED IOS_DEPLOYMENT_TARGET)
set(IOS_DEPLOYMENT_TARGET "7.0")
endif()
set(IOS_DEPLOYMENT_TARGET ${IOS_DEPLOYMENT_TARGET} CACHE STRING "Minimum iOS version")
# Whether to enable bitcode
if(NOT DEFINED IOS_ENABLE_BITCODE)
set(IOS_ENABLE_BITCODE ON)
endif()
set(IOS_ENABLE_BITCODE ${IOS_ENABLE_BITCODE} CACHE BOOL "Whether to enable bitcode")
if(NOT DEFINED IOS_USE_VECLIB_FOR_BLAS)
set(IOS_USE_VECLIB_FOR_BLAS OFF)
endif()
set(IOS_USE_VECLIB_FOR_BLAS ${IOS_UES_VECLIB_FOR_BLAS} CACHE BOOL "Whether to use veclib")
# Check the platform selection and setup for developer root
if(${IOS_PLATFORM} STREQUAL "OS")
set(IOS_PLATFORM_LOCATION "iPhoneOS.platform")
set(XCODE_IOS_PLATFORM iphoneos)
# This causes the installers to properly locate the output libraries
set(CMAKE_XCODE_EFFECTIVE_PLATFORMS "-iphoneos")
elseif(${IOS_PLATFORM} STREQUAL "SIMULATOR")
set(IOS_PLATFORM_LOCATION "iPhoneSimulator.platform")
set(XCODE_IOS_PLATFORM iphonesimulator)
# This causes the installers to properly locate the output libraries
set(CMAKE_XCODE_EFFECTIVE_PLATFORMS "-iphonesimulator")
elseif(${IOS_PLATFORM} STREQUAL "WATCHOS")
set(IOS_PLATFORM_LOCATION "WatchOS.platform")
set(XCODE_IOS_PLATFORM watchos)
# This causes the installers to properly locate the output libraries
set(CMAKE_XCODE_EFFECTIVE_PLATFORMS "-watchos")
else(${IOS_PLATFORM} STREQUAL "OS")
message(FATAL_ERROR "Unsupported IOS_PLATFORM value selected. Please set to\n"
"\t OS, SIMULATOR, or WATCHOS.")
endif()
# Check iOS developer toolchain
if(NOT DEFINED IOS_DEVELOPER_ROOT)
# Setup iOS developer location
execute_process(COMMAND xcode-select -print-path
OUTPUT_VARIABLE XCODE_DEVELOPER_DIR
RESULT_VARIABLE XCODE_DEVELOPER_DIR_RESULT
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
# Xcode 4.3 changed the installation location, choose the most recent one available
if(${XCODE_VERSION} VERSION_LESS "4.3.0")
set(IOS_DEVELOPER_ROOT "/Developer/Platforms/${IOS_PLATFORM_LOCATION}/Developer")
else()
set(IOS_DEVELOPER_ROOT "${XCODE_DEVELOPER_DIR}/Platforms/${IOS_PLATFORM_LOCATION}/Developer")
endif()
endif()
if(EXISTS ${IOS_DEVELOPER_ROOT})
set(IOS_DEVELOPER_ROOT ${IOS_DEVELOPER_ROOT} CACHE PATH "Location of iOS Platform")
else()
message(FATAL_ERROR "Invalid IOS_DEVELOPER_ROOT: ${IOS_DEVELOPER_ROOT} does not exist.")
endif()
# Check iOS SDK
if(NOT DEFINED IOS_SDK_ROOT)
# Find and use the most recent iOS sdk
file(GLOB IOS_SDK_LISTS "${IOS_DEVELOPER_ROOT}/SDKs/*")
if(IOS_SDK_LISTS)
list(SORT IOS_SDK_LISTS)
list(REVERSE IOS_SDK_LISTS)
list(GET IOS_SDK_LISTS 0 IOS_SDK_ROOT)
else(IOS_SDK_LISTS)
message(FATAL_ERROR "No iOS SDK's found in default search path ${IOS_DEVELOPER_ROOT}."
" Please manually set IOS_SDK_ROOT or install the iOS SDK.")
endif(IOS_SDK_LISTS)
endif()
if(EXISTS ${IOS_SDK_ROOT})
set(IOS_SDK_ROOT ${IOS_SDK_ROOT} CACHE PATH "Location of the selected iOS SDK")
message(STATUS "iOS toolchain: ${IOS_SDK_ROOT}")
else()
message(FATAL_ERROR "Invalid IOS_SDK_ROOT: ${IOS_SDK_ROOT} does not exist.")
endif()
# Set the sysroot default to the most recent SDK
set(CMAKE_OSX_SYSROOT ${IOS_SDK_ROOT} CACHE PATH "Sysroot used for iOS support")
# Get version of iOS SDK
execute_process(COMMAND xcodebuild -sdk ${CMAKE_OSX_SYSROOT} -version SDKVersion
OUTPUT_VARIABLE IOS_SDK_VERSION
RESULT_VARIABLE IOS_SDK_VERSION_RESULT
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
if(${IOS_SDK_VERSION_RESULT})
string(REGEX MATCH "(([0-9]+)\\.)+([0-9]+)" IOS_SDK_VERSION "${IOS_SDK_ROOT}")
endif()
if(NOT IOS_SDK_VERSION)
message(WARNING "Cannot get SDK's version.")
set(IOS_SDK_VERSION 1)
endif()
set(CMAKE_SYSTEM_VERSION ${IOS_SDK_VERSION})
# Find the C & C++ compilers for the specified SDK.
if(NOT CMAKE_C_COMPILER)
# Default to use clang
execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find clang
OUTPUT_VARIABLE IOS_C_COMPILER
RESULT_VARIABLE IOS_C_COMPILER_RESULT
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
if(${IOS_C_COMPILER_RESULT})
get_filename_component(IOS_C_COMPILER clang PROGRAM)
endif()
else(NOT CMAKE_C_COMPILER)
# User can set it in cmake command
get_filename_component(IOS_C_COMPILER ${CMAKE_C_COMPILER} PROGRAM)
endif(NOT CMAKE_C_COMPILER)
if(NOT EXISTS ${IOS_C_COMPILER})
message(FATAL_ERROR "Cannot find C compiler: ${IOS_C_COMPILER}")
endif()
if(NOT CMAKE_CXX_COMPILER)
# Default to use clang++
execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find clang++
OUTPUT_VARIABLE IOS_CXX_COMPILER
RESULT_VARIABLE IOS_CXX_COMPILER_RESULT
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
if(${IOS_CXX_COMPILER_RESULT})
get_filename_component(IOS_CXX_COMPILER clang++ PROGRAM)
endif()
else(NOT CMAKE_CXX_COMPILER)
# User can set it in cmake command
get_filename_component(IOS_CXX_COMPILER ${CMAKE_CXX_COMPILER} PROGRAM)
endif(NOT CMAKE_CXX_COMPILER)
if(NOT EXISTS ${IOS_CXX_COMPILER})
message(FATAL_ERROR "Cannot find CXX compiler: ${IOS_CXX_COMPILER}")
endif()
set(CMAKE_C_COMPILER ${IOS_C_COMPILER} CACHE PATH "C compiler" FORCE)
set(CMAKE_CXX_COMPILER ${IOS_CXX_COMPILER} CACHE PATH "CXX compiler" FORCE)
set(CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG "-compatibility_version ")
set(CMAKE_C_OSX_CURRENT_VERSION_FLAG "-current_version ")
set(CMAKE_CXX_OSX_COMPATIBILITY_VERSION_FLAG "${CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG}")
set(CMAKE_CXX_OSX_CURRENT_VERSION_FLAG "${CMAKE_C_OSX_CURRENT_VERSION_FLAG}")
# Set iOS specific C/C++ flags
if(IOS_PLATFORM STREQUAL "OS")
if(XCODE_VERSION VERSION_LESS "7.0")
set(XCODE_IOS_PLATFORM_VERSION_FLAGS "-mios-version-min=${IOS_DEPLOYMENT_TARGET}")
else()
# Xcode 7.0+ uses flags we can build directly from XCODE_IOS_PLATFORM.
set(XCODE_IOS_PLATFORM_VERSION_FLAGS "-m${XCODE_IOS_PLATFORM}-version-min=${IOS_DEPLOYMENT_TARGET}")
endif()
else()
set(XCODE_IOS_FLATFORM_VERSION_FLAGS "-mios-simulator-version-min=${IOS_DEPLOYMENT_TARGET}")
endif()
if(IOS_ENABLE_BITCODE)
set(XCODE_IOS_BITCODE_FLAGS "${IOS_COMPILER_FLAGS} -fembed-bitcode")
else()
set(XCODE_IOS_BITCODE_FLAGS "")
endif()
set(IOS_COMPILER_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} ${XCODE_IOS_BITCODE_FLAGS}")
# Hidden visibilty is required for cxx on iOS
set(CMAKE_C_FLAGS "${IOS_COMPILER_FLAGS} ${CMAKE_C_FLAGS}" CACHE STRING "C flags")
set(CMAKE_CXX_FLAGS "${IOS_COMPILER_FLAGS} -fvisibility=hidden -fvisibility-inlines-hidden ${CMAKE_CXX_FLAGS}" CACHE STRING "CXX flags")
set(IOS_LINK_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} -Wl,-search_paths_first")
if(IOS_USE_VECLIB_FOR_BLAS)
# Find vecLib for iOS
set(VECLIB_SEARCH_DIRS
${IOS_SDK_ROOT}/System/Library/Frameworks/Accelerate.framework/Versions/Current/Frameworks
${IOS_SDK_ROOT}/System/Library/Frameworks/Accelerate.framework/Frameworks
)
find_path(VECLIB_INC_DIR vecLib.h PATHS ${VECLIB_SEARCH_DIRS}/vecLib.framework/Headers)
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(vecLib DEFAULT_MSG VECLIB_INC_DIR)
if(VECLIB_FOUND)
if(VECLIB_INC_DIR MATCHES "^/System/Library/Frameworks/vecLib.framework.*")
set(IOS_LINK_FLAGS ${IOS_LINK_FLAGS} -lcblas "-framework vecLib")
message(STATUS "Found standalone vecLib.framework")
else()
set(IOS_LINK_FLAGS ${IOS_LINK_FLAGS} -lcblas "-framework Accelerate")
message(STATUS "Found vecLib as part of Accelerate.framework")
endif()
endif()
endif()
set(CMAKE_C_LINK_FLAGS "${IOS_LINK_FLAGS} ${CMAKE_C_LINK_FLAGS}")
set(CMAKE_CXX_LINK_FLAGS "${IOS_LINK_FLAGS} ${CMAKE_CXX_LINK_FLAGS}")
set(CMAKE_PLATFORM_HAS_INSTALLNAME 1)
if(NOT IOS_ENABLE_BITCODE)
set(CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS "-dynamiclib -headerpad_max_install_names")
set(CMAKE_SHARED_MODULE_CREATE_C_FLAGS "-bundle -headerpad_max_install_names")
else()
set(CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS "-dynamiclib")
set(CMAKE_SHARED_MODULE_CREATE_C_FLAGS "-bundle")
endif()
set(CMAKE_SHARED_MODULE_LOADER_C_FLAG "-Wl,-bundle_loader,")
set(CMAKE_SHARED_MODULE_LOADER_CXX_FLAG "-Wl,-bundle_loader,")
set(CMAKE_FIND_LIBRARY_SUFFIXES ".dylib" ".so" ".a")
# hack: if a new cmake (which uses CMAKE_INSTALL_NAME_TOOL) runs on an old build tree
# (where install_name_tool was hardcoded) and where CMAKE_INSTALL_NAME_TOOL isn't in the cache
# and still cmake didn't fail in CMakeFindBinUtils.cmake (because it isn't rerun)
# hardcode CMAKE_INSTALL_NAME_TOOL here to install_name_tool, so it behaves as it did before, Alex
if(NOT DEFINED CMAKE_INSTALL_NAME_TOOL)
find_program(CMAKE_INSTALL_NAME_TOOL install_name_tool)
endif()
# Set the find root to the iOS developer roots and to user defined paths
set(CMAKE_FIND_ROOT_PATH ${IOS_DEVELOPER_ROOT} ${IOS_SDK_ROOT} ${CMAKE_PREFIX_PATH}
CACHE string "iOS find search path root")
# default to searching for frameworks first
set(CMAKE_FIND_FRAMEWORK FIRST)
# set up the default search directories for frameworks
set(CMAKE_SYSTEM_FRAMEWORK_PATH
${IOS_SDK_ROOT}/System/Library/Frameworks
${IOS_SDK_ROOT}/System/Library/PrivateFrameworks
${IOS_SDK_ROOT}/Developer/Library/Frameworks
)
# only search the iOS sdks, not the remainder of the host filesystem
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
message(STATUS "iOS: Targeting iOS '${CMAKE_SYSTEM_VERSION}', "
"building for '${IOS_PLATFORM}' platform, with architecture '${CMAKE_OSX_ARCHITECTURES}'")
message(STATUS "System CMAKE_C_FLAGS: ${CMAKE_C_FLAGS}")
message(STATUS "System CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
# Used in ExternalProject command
string(REPLACE ";" "\\$<SEMICOLON>" EXTERNAL_IOS_ARCHITECTURES "${CMAKE_OSX_ARCHITECTURES}")
set(EXTERNAL_OPTIONAL_ARGS
-DCMAKE_OSX_SYSROOT=${CMAKE_OSX_SYSROOT}
-DCMAKE_OSX_ARCHITECTURES=${EXTERNAL_IOS_ARCHITECTURES})
# This little macro lets you set any XCode specific property
macro(set_xcode_property TARGET XCODE_PROPERTY XCODE_VALUE)
set_property (TARGET ${TARGET} PROPERTY XCODE_ATTRIBUTE_${XCODE_PROPERTY} ${XCODE_VALUE})
endmacro(set_xcode_property)
# This macro lets you find executable programs on the host system
macro(find_host_package)
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE NEVER)
set(IOS FALSE)
find_package(${ARGN})
set(IOS TRUE)
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
endmacro(find_host_package)
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This is a toolchain file for cross-compiling for Raspberry Pi.
# The supported variables are listed belows:
#
# RPI_TOOLCHAIN
# RPI_ARM_NEON
#
# Also you can set CMAKE_C/CXX_COMPILER yourself, through cmake arguments.
IF(NOT RPI)
return()
ENDIF()
SET(CMAKE_SYSTEM_NAME Linux)
SET(CMAKE_SYSTEM_VERSION 1)
SET(CMAKE_SYSTEM_PROCESSOR arm)
# check the exist of raspberry pi toolchain
IF(NOT DEFINED RPI_TOOLCHAIN)
SET(RPI_TOOLCHAIN $ENV{RPI_TOOLCHAIN}
CACHE PATH "Folder holds the toolchain of Raspberr Pi")
ENDIF()
IF(NOT RPI_TOOLCHAIN)
MESSAGE(WARNING "It is recommended to set RPI_TOOLCHAIN to use toolchain.\n"
"To cross-compile for Raspberry Pi, you need to download the tools using:\n"
" git clone https://github.com/raspberrypi/tools\n")
ENDIF()
IF(NOT DEFINED RPI_ARM_NEON)
SET(RPI_ARM_NEON ON)
ENDIF()
IF(RPI_TOOLCHAIN)
SET(RPI_TOOLCHAIN_ROOT ${RPI_TOOLCHAIN})
IF(RPI_TOOLCHAIN_ROOT MATCHES "gcc-linaro-arm-linux-gnueabihf-raspbian(-x64)?$")
# gcc-linaro-arm-linux-gnueabihf-raspbian
# gcc-linaro-arm-linux-gnueabihf-raspbian-x64
SET(RPI_TOOLCHAIN_NAME arm-linux-gnueabihf)
ENDIF()
SET(RPI_TOOLCHAIN_PREFIX "${RPI_TOOLCHAIN_ROOT}/bin/${RPI_TOOLCHAIN_NAME}-")
ENDIF()
# C compiler
IF(NOT CMAKE_C_COMPILER)
SET(RPI_C_COMPILER "${RPI_TOOLCHAIN_PREFIX}gcc")
ELSE()
GET_FILENAME_COMPONENT(RPI_C_COMPILER ${CMAKE_C_COMPILER} PROGRAM)
ENDIF()
IF(NOT EXISTS ${RPI_C_COMPILER})
MESSAGE(FATAL_ERROR "Cannot find C compiler: ${RPI_C_COMPILER}")
ENDIF()
# CXX compiler
IF(NOT CMAKE_CXX_COMPILER)
SET(RPI_CXX_COMPILER "${RPI_TOOLCHAIN_PREFIX}g++")
ELSE()
GET_FILENAME_COMPONENT(RPI_CXX_COMPILER ${CMAKE_CXX_COMPILER} PROGRAM)
ENDIF()
IF(NOT EXISTS ${RPI_CXX_COMPILER})
MESSAGE(FATAL_ERROR "Cannot find CXX compiler: ${RPI_CXX_COMPILER}")
ENDIF()
SET(CMAKE_C_COMPILER ${RPI_C_COMPILER} CACHE PATH "C compiler" FORCE)
SET(CMAKE_CXX_COMPILER ${RPI_CXX_COMPILER} CACHE PATH "CXX compiler" FORCE)
IF(RPI_ARM_NEON)
SET(RPI_C_FLAGS "${RPI_C_FLAGS} -mfpu=neon")
ENDIF()
SET(CMAKE_C_FLAGS "${RPI_C_FLAGS} ${CMAKE_C_FLAGS}" CACHE STRING "C flags")
SET(CMAKE_CXX_FLAGS "${RPI_C_FLAGS} ${CMAKE_CXX_FLAGS}" CACHE STRING "CXX flags")
......@@ -63,9 +63,7 @@ function(select_nvcc_arch_flags out_variable)
# List of arch names
set(archs_names "Kepler" "Maxwell" "Pascal" "Volta" "Turing" "All" "Manual")
set(archs_name_default "All")
if(NOT CMAKE_CROSSCOMPILING)
list(APPEND archs_names "Auto")
endif()
list(APPEND archs_names "Auto")
# set CUDA_ARCH_NAME strings (so it will be seen as dropbox in CMake-Gui)
set(CUDA_ARCH_NAME ${archs_name_default} CACHE STRING "Select target NVIDIA GPU achitecture.")
......
......@@ -13,7 +13,7 @@
# limitations under the License.
#
IF(MOBILE_INFERENCE OR NOT WITH_DISTRIBUTE)
IF(NOT WITH_DISTRIBUTE)
return()
ENDIF()
......
......@@ -71,13 +71,3 @@ if (WIN32)
set_property(GLOBAL PROPERTY OS_DEPENDENCY_MODULES shlwapi.lib)
endif(HAVE_SHLWAPI)
endif (WIN32)
IF(WITH_C_API)
INSTALL(DIRECTORY ${GFLAGS_INCLUDE_DIR} DESTINATION third_party/gflags)
IF(ANDROID)
INSTALL(FILES ${GFLAGS_LIBRARIES} DESTINATION third_party/gflags/lib/${ANDROID_ABI})
ELSE()
INSTALL(FILES ${GFLAGS_LIBRARIES} DESTINATION third_party/gflags/lib)
ENDIF()
ENDIF()
......@@ -26,14 +26,8 @@ ENDIF(WIN32)
INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR})
IF(ANDROID AND ${CMAKE_SYSTEM_VERSION} VERSION_LESS "21")
# Using the unofficial glog for Android API < 21
SET(GLOG_REPOSITORY "https://github.com/Xreki/glog.git")
SET(GLOG_TAG "8a547150548b284382ccb6582408e9140ff2bea8")
ELSE()
SET(GLOG_REPOSITORY "https://github.com/google/glog.git")
SET(GLOG_TAG "v0.3.5")
ENDIF()
SET(GLOG_REPOSITORY "https://github.com/google/glog.git")
SET(GLOG_TAG "v0.3.5")
ExternalProject_Add(
extern_glog
......@@ -78,12 +72,3 @@ ADD_DEPENDENCIES(glog extern_glog gflags)
LINK_LIBRARIES(glog gflags)
LIST(APPEND external_project_dependencies glog)
IF(WITH_C_API)
INSTALL(DIRECTORY ${GLOG_INCLUDE_DIR} DESTINATION third_party/glog)
IF(ANDROID)
INSTALL(FILES ${GLOG_LIBRARIES} DESTINATION third_party/glog/lib/${ANDROID_ABI})
ELSE()
INSTALL(FILES ${GLOG_LIBRARIES} DESTINATION third_party/glog/lib)
ENDIF()
ENDIF()
......@@ -13,7 +13,7 @@
# limitations under the License.
#
IF(MOBILE_INFERENCE OR NOT WITH_DISTRIBUTE)
IF(NOT WITH_DISTRIBUTE)
return()
ENDIF()
......
......@@ -13,10 +13,6 @@
# limitations under the License.
#
IF(MOBILE_INFERENCE)
return()
ENDIF()
include (ExternalProject)
# NOTE: gzstream is needed when linking with ctr reader.
......
......@@ -19,8 +19,8 @@ IF(NOT WITH_LIBXSMM)
return()
ENDIF()
IF(WIN32 OR APPLE OR ANDROID OR IOS)
MESSAGE(WARNING "Windows, Mac or Mobile are not supported with libxsmm in Paddle yet.")
IF(WIN32 OR APPLE)
MESSAGE(WARNING "Windows, Mac are not supported with libxsmm in Paddle yet.")
SET(WITH_LIBXSMM OFF CACHE STRING "Disable LIBXSMM" FORCE)
return()
ENDIF()
......
......@@ -110,7 +110,3 @@ else(WIN32)
endif(WIN32)
ADD_CUSTOM_TARGET(mkldnn_shared_lib ALL DEPENDS ${MKLDNN_SHARED_LIB})
ADD_DEPENDENCIES(mkldnn_shared_lib ${MKLDNN_PROJECT} mkldnn)
IF(WITH_C_API)
INSTALL(FILES ${MKLDNN_SHARED_LIB} DESTINATION lib)
ENDIF()
......@@ -74,7 +74,3 @@ ADD_LIBRARY(mklml SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET mklml PROPERTY IMPORTED_LOCATION ${MKLML_LIB})
ADD_DEPENDENCIES(mklml ${MKLML_PROJECT})
LIST(APPEND external_project_dependencies mklml)
IF(WITH_C_API)
INSTALL(FILES ${MKLML_LIB} ${MKLML_IOMP_LIB} DESTINATION lib)
ENDIF()
# Find the NNPACK library
# NNPACK_ROOT - where to find NNPACK include and library.
#
set(NNPACK_FOUND OFF)
set(NNPACK_ROOT $ENV{NNPACK_ROOT} CACHE PATH "Folder contains NNPACK")
find_path(NNPACK_INC_DIR nnpack.h PATHS ${NNPACK_ROOT}/include)
find_library(NNPACK_LIB NAMES nnpack PATHS ${NNPACK_ROOT}/lib)
find_library(PTHREADPOOL_LIB NAMES pthreadpool PATHS ${NNPACK_ROOT}/lib)
find_library(NNPACK_UKERNELS_LIB NAMES nnpack_ukernels PATHS ${NNPACK_ROOT}/lib)
find_library(NNPACK_CPUFEATURES_LIB NAMES cpufeatures PATHS ${NNPACK_ROOT}/lib)
if(NNPACK_INC_DIR AND NNPACK_LIB AND PTHREADPOOL_LIB)
set(NNPACK_FOUND ON)
INCLUDE_DIRECTORIES(${NNPACK_INC_DIR})
set(NNPACK_LIBS)
list(APPEND NNPACK_LIBS ${NNPACK_LIB} ${PTHREADPOOL_LIB})
if (NNPACK_UKERNELS_LIB)
list(APPEND NNPACK_LIBS ${NNPACK_UKERNELS_LIB})
endif()
if (NNPACK_CPUFEATURES_LIB)
list(APPEND NNPACK_LIBS ${NNPACK_CPUFEATURES_LIB})
endif()
if(NOT ANDROID)
list(APPEND NNPACK_LIBS "rt")
endif()
else()
message(FATAL_ERROR "Cannot find NNPACK in (${NNPACK_ROOT})")
endif()
......@@ -40,38 +40,12 @@ IF(NOT ${CBLAS_FOUND})
SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable")
SET(OPENBLAS_COMMIT "v0.2.20")
IF(CMAKE_CROSSCOMPILING)
SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER})
GET_FILENAME_COMPONENT(CROSS_SUFFIX ${CMAKE_C_COMPILER} DIRECTORY)
SET(CROSS_SUFFIX ${CROSS_SUFFIX}/)
IF(ANDROID)
IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
# use softfp
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0)
ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a")
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0)
ENDIF()
ELSEIF(IOS)
IF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
SET(OPENBLAS_CC "${OPENBLAS_CC} ${CMAKE_C_FLAGS} -isysroot ${CMAKE_OSX_SYSROOT}")
SET(OPENBLAS_CC "${OPENBLAS_CC} -arch arm64")
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0 CROSS_SUFFIX=${CROSS_SUFFIX})
ELSE()
MESSAGE(FATAL_ERROR "OpenBLAS only support arm64 architectures on iOS. "
"You can set IOS_USE_VECLIB_FOR_BLAS=ON or USE_EIGEN_FOR_BLAS=ON to use other blas library instead.")
ENDIF()
ELSEIF(RPI)
# use hardfp
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 USE_THREAD=0)
ENDIF()
ELSE()
IF(APPLE)
SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}")
ENDIF()
SET(OPTIONAL_ARGS "")
IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(_64)?$")
SET(OPTIONAL_ARGS DYNAMIC_ARCH=1 NUM_THREADS=64)
ENDIF()
IF(APPLE)
SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}")
ENDIF()
SET(OPTIONAL_ARGS "")
IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(_64)?$")
SET(OPTIONAL_ARGS DYNAMIC_ARCH=1 NUM_THREADS=64)
ENDIF()
SET(COMMON_ARGS CC=${OPENBLAS_CC} NO_SHARED=1 NO_LAPACK=1 libs)
......@@ -92,25 +66,6 @@ IF(NOT ${CBLAS_FOUND})
ELSE()
ENDIF(NOT WIN32)
SET(CBLAS_PROVIDER openblas)
IF(WITH_C_API)
INSTALL(DIRECTORY ${CBLAS_INC_DIR} DESTINATION third_party/openblas)
# Because libopenblas.a is a symbolic link of another library, thus need to
# install the whole directory.
IF(ANDROID)
SET(TMP_INSTALL_DIR third_party/openblas/lib/${ANDROID_ABI})
ELSE()
SET(TMP_INSTALL_DIR third_party/openblas/lib)
ENDIF()
INSTALL(CODE "execute_process(
COMMAND ${CMAKE_COMMAND} -E copy_directory ${CBLAS_INSTALL_DIR}/lib
${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}
)"
)
INSTALL(CODE "MESSAGE(STATUS \"Installing: \"
\"${CBLAS_INSTALL_DIR}/lib -> ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}\"
)"
)
ENDIF()
ENDIF(NOT ${CBLAS_FOUND})
MESSAGE(STATUS "BLAS library: ${CBLAS_LIBRARIES}")
......
......@@ -204,15 +204,6 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
SET(PROTOBUF_REPO "https://github.com/google/protobuf.git")
SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546")
IF(MOBILE_INFERENCE)
# The reason why the official version is not used is described in
# https://github.com/PaddlePaddle/Paddle/issues/6114
SET(PROTOBUF_REPO "https://github.com/qingqing01/protobuf.git")
SET(PROTOBUF_TAG "v3.2.0")
IF(NOT BUILD_FOR_HOST)
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} "-Dprotobuf_BUILD_PROTOC_BINARIES=OFF")
ENDIF()
ENDIF()
ExternalProject_Add(
${TARGET_NAME}
......@@ -240,19 +231,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
)
ENDFUNCTION()
IF(NOT MOBILE_INFERENCE)
SET(PROTOBUF_VERSION 3.1)
ELSE()
SET(PROTOBUF_VERSION 3.2)
ENDIF()
IF(CMAKE_CROSSCOMPILING)
build_protobuf(protobuf_host TRUE)
LIST(APPEND external_project_dependencies protobuf_host)
SET(PROTOBUF_PROTOC_EXECUTABLE ${protobuf_host_PROTOC_EXECUTABLE}
CACHE FILEPATH "protobuf executable." FORCE)
ENDIF()
SET(PROTOBUF_VERSION 3.1)
IF(NOT PROTOBUF_FOUND)
build_protobuf(extern_protobuf FALSE)
......@@ -266,20 +245,7 @@ IF(NOT PROTOBUF_FOUND)
SET(PROTOBUF_PROTOC_LIBRARY ${extern_protobuf_PROTOC_LIBRARY}
CACHE FILEPATH "protoc library." FORCE)
IF(WITH_C_API)
INSTALL(DIRECTORY ${PROTOBUF_INCLUDE_DIR} DESTINATION third_party/protobuf)
IF(ANDROID)
INSTALL(FILES ${PROTOBUF_LITE_LIBRARY} DESTINATION third_party/protobuf/lib/${ANDROID_ABI})
ELSE()
INSTALL(FILES ${PROTOBUF_LITE_LIBRARY} DESTINATION third_party/protobuf/lib)
ENDIF()
ENDIF()
IF(CMAKE_CROSSCOMPILING)
PROMPT_PROTOBUF_LIB(protobuf_host extern_protobuf)
ELSE()
SET(PROTOBUF_PROTOC_EXECUTABLE ${extern_protobuf_PROTOC_EXECUTABLE}
CACHE FILEPATH "protobuf executable." FORCE)
PROMPT_PROTOBUF_LIB(extern_protobuf)
ENDIF()
SET(PROTOBUF_PROTOC_EXECUTABLE ${extern_protobuf_PROTOC_EXECUTABLE}
CACHE FILEPATH "protobuf executable." FORCE)
PROMPT_PROTOBUF_LIB(extern_protobuf)
ENDIF(NOT PROTOBUF_FOUND)
......@@ -71,7 +71,3 @@ ADD_LIBRARY(pslib SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET pslib PROPERTY IMPORTED_LOCATION ${PSLIB_LIB})
ADD_DEPENDENCIES(pslib ${PSLIB_PROJECT})
LIST(APPEND external_project_dependencies pslib)
IF(WITH_C_API)
INSTALL(FILES ${PSLIB_LIB} ${PSLIB_IOMP_LIB} DESTINATION lib)
ENDIF()
......@@ -71,7 +71,3 @@ ADD_LIBRARY(pslib_brpc SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET pslib_brpc PROPERTY IMPORTED_LOCATION ${PSLIB_BRPC_LIB})
ADD_DEPENDENCIES(pslib_brpc ${PSLIB_BRPC_PROJECT})
LIST(APPEND external_project_dependencies pslib_brpc)
IF(WITH_C_API)
INSTALL(FILES ${PSLIB_BRPC_LIB} ${PSLIB_BRPC_IOMP_LIB} DESTINATION lib)
ENDIF()
......@@ -12,10 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
if(MOBILE_INFERENCE OR RPI)
return()
endif()
include (ExternalProject)
# NOTE: snappy is needed when linking with recordio
......
......@@ -12,10 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
IF(MOBILE_INFERENCE OR RPI)
return()
ENDIF()
include (ExternalProject)
set(SNAPPYSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy_stream)
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
IF(NOT WITH_SWIG_PY)
return()
ENDIF()
FIND_PACKAGE(SWIG)
IF(NOT SWIG_FOUND)
# build swig as an external project
INCLUDE(ExternalProject)
SET(SWIG_SOURCES_DIR ${THIRD_PARTY_PATH}/swig)
SET(SWIG_INSTALL_DIR ${THIRD_PARTY_PATH}/install/swig)
SET(SWIG_TARGET_VERSION "3.0.2")
SET(SWIG_DOWNLOAD_SRC_MD5 "62f9b0d010cef36a13a010dc530d0d41")
SET(SWIG_DOWNLOAD_WIN_MD5 "3f18de4fc09ab9abb0d3be37c11fbc8f")
IF(WIN32)
# swig.exe available as pre-built binary on Windows:
ExternalProject_Add(swig
URL http://prdownloads.sourceforge.net/swig/swigwin-${SWIG_TARGET_VERSION}.zip
URL_MD5 ${SWIG_DOWNLOAD_WIN_MD5}
SOURCE_DIR ${SWIG_SOURCES_DIR}
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND ""
UPDATE_COMMAND ""
)
SET(SWIG_DIR ${SWIG_SOURCES_DIR} CACHE FILEPATH "SWIG Directory" FORCE)
SET(SWIG_EXECUTABLE ${SWIG_SOURCES_DIR}/swig.exe CACHE FILEPATH "SWIG Executable" FORCE)
ELSE(WIN32)
# swig uses bison find it by cmake and pass it down
FIND_PACKAGE(BISON)
# From SWIG configure
ExternalProject_Add(swig
GIT_REPOSITORY https://github.com/swig/swig.git
GIT_TAG rel-3.0.10
PREFIX ${SWIG_SOURCES_DIR}
CONFIGURE_COMMAND cd <SOURCE_DIR> && ./autogen.sh && ./configure
--prefix=${SWIG_INSTALL_DIR} --without-pcre
BUILD_COMMAND cd <SOURCE_DIR> && make
INSTALL_COMMAND cd <SOURCE_DIR> && make install
UPDATE_COMMAND ""
)
SET(SWIG_DIR ${SWIG_INSTALL_DIR}/share/swig/${SWIG_TARGET_VERSION})
SET(SWIG_EXECUTABLE ${SWIG_INSTALL_DIR}/bin/swig)
ENDIF(WIN32)
LIST(APPEND external_project_dependencies swig)
ENDIF(NOT SWIG_FOUND)
......@@ -12,10 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
IF(MOBILE_INFERENCE)
return()
ENDIF()
INCLUDE(ExternalProject)
SET(WARPCTC_SOURCES_DIR ${THIRD_PARTY_PATH}/warpctc)
......
......@@ -73,12 +73,3 @@ include_directories(${XXHASH_INCLUDE_DIR})
add_dependencies(xxhash extern_xxhash)
LIST(APPEND external_project_dependencies xxhash)
IF(WITH_C_API)
INSTALL(DIRECTORY ${XXHASH_INCLUDE_DIR} DESTINATION third_party/xxhash)
IF(ANDROID)
INSTALL(FILES ${XXHASH_LIBRARIES} DESTINATION third_party/xxhash/lib/${ANDROID_ABI})
ELSE()
INSTALL(FILES ${XXHASH_LIBRARIES} DESTINATION third_party/xxhash/lib)
ENDIF()
ENDIF()
......@@ -59,12 +59,3 @@ SET_PROPERTY(TARGET zlib PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES})
ADD_DEPENDENCIES(zlib extern_zlib)
LIST(APPEND external_project_dependencies zlib)
IF(WITH_C_API)
INSTALL(DIRECTORY ${ZLIB_INCLUDE_DIR} DESTINATION third_party/zlib)
IF(ANDROID)
INSTALL(FILES ${ZLIB_LIBRARIES} DESTINATION third_party/zlib/lib/${ANDROID_ABI})
ELSE()
INSTALL(FILES ${ZLIB_LIBRARIES} DESTINATION third_party/zlib/lib)
ENDIF()
ENDIF()
......@@ -156,10 +156,8 @@ set(GPU_COMMON_FLAGS
endif(NOT WIN32)
if (APPLE)
if(NOT CMAKE_CROSSCOMPILING)
# On Mac OS X build fat binaries with x86_64 architectures by default.
set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE)
endif()
# On Mac OS X build fat binaries with x86_64 architectures by default.
set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE)
# On Mac OS X register class specifier is deprecated and will cause warning error on latest clang 10.0
set (COMMON_FLAGS -Wno-deprecated-register)
endif(APPLE)
......
......@@ -90,11 +90,11 @@
# including binary directory for generated headers.
include_directories(${CMAKE_CURRENT_BINARY_DIR})
if(NOT APPLE AND NOT ANDROID)
if(NOT APPLE)
find_package(Threads REQUIRED)
link_libraries(${CMAKE_THREAD_LIBS_INIT})
set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt")
endif(NOT APPLE AND NOT ANDROID)
endif(NOT APPLE)
set_property(GLOBAL PROPERTY FLUID_MODULES "")
# find all fluid modules is used for paddle fluid static library
......@@ -388,6 +388,7 @@ function(cc_test TARGET_NAME)
endif()
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_limit_of_tmp_allocation=4294967296) # 4G
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
# No unit test should exceed 10 minutes.
set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
......@@ -460,6 +461,7 @@ function(nv_test TARGET_NAME)
endif()
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_limit_of_tmp_allocation=4294967296) # 4G
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
endif()
endfunction(nv_test)
......@@ -655,12 +657,6 @@ function(paddle_protobuf_generate_cpp SRCS HDRS)
set(${SRCS})
set(${HDRS})
if (MOBILE_INFERENCE)
set(EXTRA_FLAG "lite:")
else()
set(EXTRA_FLAG "")
endif()
foreach(FIL ${ARGN})
get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
get_filename_component(FIL_WE ${FIL} NAME_WE)
......@@ -677,7 +673,7 @@ function(paddle_protobuf_generate_cpp SRCS HDRS)
COMMAND ${CMAKE_COMMAND} -E make_directory "${CMAKE_CURRENT_BINARY_DIR}"
COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
-I${CMAKE_CURRENT_SOURCE_DIR}
--cpp_out "${EXTRA_FLAG}${CMAKE_CURRENT_BINARY_DIR}" ${ABS_FIL}
--cpp_out "${CMAKE_CURRENT_BINARY_DIR}" ${ABS_FIL}
DEPENDS ${ABS_FIL} protoc
COMMENT "Running C++ protocol buffer compiler on ${FIL}"
VERBATIM )
......@@ -714,9 +710,10 @@ function(py_test TARGET_NAME)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS ARGS ENVS)
cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_test(NAME ${TARGET_NAME}
COMMAND ${CMAKE_COMMAND} -E env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true
FLAGS_cpu_deterministic=true
FLAGS_cpu_deterministic=true FLAGS_limit_of_tmp_allocation=4294967296 # 4G
PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
......
......@@ -149,25 +149,23 @@ if (WITH_NGRAPH)
)
endif ()
if (NOT MOBILE_INFERENCE AND NOT RPI)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy")
copy(snappy_lib
SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib
DEPS snappy)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy")
copy(snappy_lib
SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib
DEPS snappy)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappystream")
copy(snappystream_lib
SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib
DEPS snappystream)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappystream")
copy(snappystream_lib
SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib
DEPS snappystream)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib")
copy(zlib_lib
SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib
DEPS zlib)
endif ()
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib")
copy(zlib_lib
SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib
DEPS zlib)
# paddle fluid module
set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
......
......@@ -74,21 +74,6 @@ MARK_AS_ADVANCED(HOST_SYSTEM CPU_CORES)
MESSAGE(STATUS "Found Paddle host system: ${HOST_SYSTEM}, version: ${HOST_SYSTEM_VERSION}")
MESSAGE(STATUS "Found Paddle host system's CPU: ${CPU_CORES} cores")
# configuration for cross-compiling
IF(DEFINED CMAKE_SYSTEM_NAME)
INCLUDE(cross_compiling/host)
IF(${CMAKE_SYSTEM_NAME} STREQUAL "Android")
SET(ANDROID TRUE)
INCLUDE(cross_compiling/android)
ELSEIF(${CMAKE_SYSTEM_NAME} STREQUAL "RPi")
SET(RPI TRUE)
INCLUDE(cross_compiling/raspberry_pi)
ELSEIF(${CMAKE_SYSTEM_NAME} STREQUAL "iOS")
SET(IOS TRUE)
INCLUDE(cross_compiling/ios)
ENDIF()
ENDIF()
# external dependencies log output
SET(EXTERNAL_PROJECT_LOG_ARGS
LOG_DOWNLOAD 0 # Wrap download in script to log output
......
......@@ -53,118 +53,3 @@ function(target_circle_link_libraries TARGET_NAME)
"-Wl,--end-group")
endif()
endfunction()
# compile_cu_as_cpp
# Make a cu file compiled as C++
# Arguments: Source files
macro(compile_cu_as_cpp)
foreach(s ${ARGN})
set_source_files_properties(${s} PROPERTIES LANGUAGE CXX)
set_source_files_properties(${s} PROPERTIES COMPILE_FLAGS "-x c++")
endforeach()
endmacro()
# link_paddle_exe
# add paddle library for a paddle executable, such as trainer, pserver.
#
# It will handle WITH_PYTHON etc.
function(link_paddle_exe TARGET_NAME)
if(WITH_RDMA)
generate_rdma_links()
endif()
if(MOBILE_INFERENCE)
target_circle_link_libraries(${TARGET_NAME}
ARCHIVE_START
paddle_gserver
paddle_function
ARCHIVE_END
paddle_math
paddle_utils
paddle_parameter
paddle_proto
paddle_cuda
${EXTERNAL_LIBS}
${CMAKE_THREAD_LIBS_INIT}
${CMAKE_DL_LIBS}
${RDMA_LD_FLAGS}
${RDMA_LIBS})
else()
target_circle_link_libraries(${TARGET_NAME}
ARCHIVE_START
paddle_gserver
paddle_function
ARCHIVE_END
paddle_pserver
paddle_trainer_lib
paddle_network
paddle_math
paddle_utils
paddle_parameter
paddle_proto
paddle_cuda
paddle_optimizer
${EXTERNAL_LIBS}
${CMAKE_THREAD_LIBS_INIT}
${CMAKE_DL_LIBS}
${RDMA_LD_FLAGS}
${RDMA_LIBS})
endif()
if(ANDROID)
target_link_libraries(${TARGET_NAME} log)
endif(ANDROID)
if(WITH_MKLML AND MKLML_LIB_DIR AND MKLML_IOMP_LIB)
target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed")
endif()
add_dependencies(${TARGET_NAME} ${external_project_dependencies})
endfunction()
# link_paddle_test
# Link a paddle unittest for target
# TARGET_NAME: the unittest target name
# Rest Arguemnts: not used.
function(link_paddle_test TARGET_NAME)
link_paddle_exe(${TARGET_NAME})
target_link_libraries(${TARGET_NAME}
paddle_test_main
paddle_test_util
${GTEST_LIBRARIES})
endfunction()
# add_unittest_without_exec
#
# create a paddle unittest. not specifically define how to run this unittest.
# TARGET_NAME: the unittest target name, same as executable file name
# Rest Arguments: the source files to compile this unittest.
macro(add_unittest_without_exec TARGET_NAME)
add_executable(${TARGET_NAME} ${ARGN})
link_paddle_test(${TARGET_NAME})
endmacro()
# add_unittest
# create a paddle unittest and just to execute this binary to make unittest.
#
# TARGET_NAME: the unittest target name, same as executable file name
# Rest Arguments: the source files to compile this unittest.
macro(add_unittest TARGET_NAME)
add_unittest_without_exec(${TARGET_NAME} ${ARGN})
add_test(${TARGET_NAME} ${TARGET_NAME})
endmacro()
# add_simple_unittest
# create a paddle unittest with file name. It just compile ${TARGET_NAME}.cpp to
# ${TARGET_NAME} and then execute it.
macro(add_simple_unittest TARGET_NAME)
add_unittest(${TARGET_NAME} ${TARGET_NAME}.cpp)
endmacro()
# Creates C resources file from files in given resource file
function(create_resources res_file output_file)
add_custom_command(
OUTPUT ${output_file}
COMMAND python ARGS ${PADDLE_SOURCE_DIR}/cmake/make_resource.py ${res_file} ${output_file}
DEPENDS ${res_file} ${PADDLE_SOURCE_DIR}/cmake/make_resource.py)
endfunction()
......@@ -45,6 +45,7 @@ paddle.fluid.AsyncExecutor.save_model ArgSpec(args=['self', 'save_path'], vararg
paddle.fluid.AsyncExecutor.stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.CompiledProgram.__init__ ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=None)
paddle.fluid.CompiledProgram.with_data_parallel ArgSpec(args=['self', 'loss_name', 'build_strategy', 'exec_strategy', 'share_vars_from'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.CompiledProgram.with_inference_optimize ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=None)
paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.ExecutionStrategy) -> None
paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.GradientScaleStrategy, arg0: int) -> None
paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.ReduceStrategy, arg0: int) -> None
......@@ -66,6 +67,7 @@ paddle.fluid.initializer.BilinearInitializer.__init__ ArgSpec(args=['self'], var
paddle.fluid.initializer.MSRAInitializer.__init__ ArgSpec(args=['self', 'uniform', 'fan_in', 'seed'], varargs=None, keywords=None, defaults=(True, None, 0))
paddle.fluid.initializer.force_init_on_cpu ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
paddle.fluid.initializer.init_on_cpu ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
paddle.fluid.initializer.NumpyArrayInitializer.__init__ ArgSpec(args=['self', 'value'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.fc ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, False, None))
paddle.fluid.layers.embedding ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32'))
paddle.fluid.layers.dynamic_lstm ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None))
......@@ -120,7 +122,7 @@ paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None,
paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None))
paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0, False))
paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False))
paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'name'], varargs=None, keywords=None, defaults=(0, None))
paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name'], varargs=None, keywords=None, defaults=(0, True, None))
paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.layer_norm ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None))
......@@ -196,7 +198,7 @@ paddle.fluid.layers.clip ArgSpec(args=['x', 'min', 'max', 'name'], varargs=None,
paddle.fluid.layers.clip_by_norm ArgSpec(args=['x', 'max_norm', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None))
paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'ignore_index', 'name'], varargs=None, keywords=None, defaults=(-100, None))
paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'ignore_index', 'name', 'normalize'], varargs=None, keywords=None, defaults=(-100, None, False))
paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.space_to_depth ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.affine_grid ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,))
......@@ -211,6 +213,7 @@ paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act
paddle.fluid.layers.merge_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1))
paddle.fluid.layers.shuffle_channel ArgSpec(args=['x', 'group', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.py_func ArgSpec(args=['func', 'x', 'out', 'backward_func', 'skip_vars_in_backward_input'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.psroi_pool ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.teacher_student_sigmoid_loss ArgSpec(args=['input', 'label', 'soft_max_up_bound', 'soft_max_lower_bound'], varargs=None, keywords=None, defaults=(15.0, -15.0))
......@@ -317,6 +320,7 @@ paddle.fluid.layers.anchor_generator ArgSpec(args=['input', 'anchor_sizes', 'asp
paddle.fluid.layers.roi_perspective_transform ArgSpec(args=['input', 'rois', 'transformed_height', 'transformed_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1.0,))
paddle.fluid.layers.generate_proposal_labels ArgSpec(args=['rpn_rois', 'gt_classes', 'is_crowd', 'gt_boxes', 'im_info', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None, True))
paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'im_info', 'anchors', 'variances', 'pre_nms_top_n', 'post_nms_top_n', 'nms_thresh', 'min_size', 'eta', 'name'], varargs=None, keywords=None, defaults=(6000, 1000, 0.5, 0.1, 1.0, None))
paddle.fluid.layers.generate_mask_labels ArgSpec(args=['im_info', 'gt_classes', 'is_crowd', 'gt_segms', 'rois', 'labels_int32', 'num_classes', 'resolution'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.iou_similarity ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.box_coder ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name'], varargs=None, keywords=None, defaults=('encode_center_size', True, None))
paddle.fluid.layers.polygon_box_transform ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,))
......@@ -356,6 +360,7 @@ paddle.fluid.contrib.QuantizeTranspiler.__init__ ArgSpec(args=['self', 'weight_b
paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.contrib.QuantizeTranspiler.freeze_program ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None))
paddle.fluid.contrib.QuantizeTranspiler.training_transpile ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.contrib.reader.ctr_reader.ctr_reader ArgSpec(args=['feed_dict', 'file_type', 'file_format', 'dense_slot_index', 'sparse_slot_index', 'capacity', 'thread_num', 'batch_size', 'file_list', 'slots', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.contrib.build_compressor ArgSpec(args=['place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'config'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None))
paddle.fluid.contrib.CompressPass.__init__ ArgSpec(args=['self', 'place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'program_exe'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None))
paddle.fluid.contrib.CompressPass.add_strategy ArgSpec(args=['self', 'strategy'], varargs=None, keywords=None, defaults=None)
......
#windows treat symbolic file as a real file, which is different with unix
#We create a hidden file and compile it instead of origin source file.
function(windows_symbolic TARGET)
......@@ -129,12 +128,6 @@ cc_test(version_test SRCS version_test.cc DEPS version)
cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version)
if(WITH_NGRAPH)
cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph)
cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog
shape_inference data_transform lod_tensor profiler)
endif(WITH_NGRAPH)
cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
......@@ -171,13 +164,12 @@ if(WITH_DISTRIBUTE)
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
else()
if(WITH_NGRAPH)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator variable_helper)
else(WITH_NGRAPH)
if (WITH_NGRAPH)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper ngraph_engine)
else ()
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
endif(WITH_NGRAPH)
endif()
cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
endif()
......@@ -214,3 +206,24 @@ endif (NOT WIN32)
cc_library(dlpack_tensor SRCS dlpack_tensor.cc DEPS tensor dlpack)
cc_test(dlpack_tensor_test SRCS dlpack_tensor_test.cc DEPS dlpack_tensor glog)
# Get the current working branch
execute_process(
COMMAND git rev-parse --abbrev-ref HEAD
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
OUTPUT_VARIABLE PADDLE_BRANCH
OUTPUT_STRIP_TRAILING_WHITESPACE
)
# Get the latest abbreviated commit hash of the working branch
execute_process(
COMMAND git log -1 --format=%h
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
OUTPUT_VARIABLE PADDLE_COMMIT
OUTPUT_STRIP_TRAILING_WHITESPACE
)
message(STATUS "commit: ${PADDLE_COMMIT}")
message(STATUS "branch: ${PADDLE_BRANCH}")
configure_file(commit.h.in commit.h)
#pragma once
#include <string>
namespace paddle {
namespace framework {
static std::string paddle_commit() {
return "@PADDLE_COMMIT@";
}
static std::string paddle_compile_branch() {
return "@PADDLE_BRANCH@";
}
static std::string paddle_version() {
return "@PADDLE_VERSION@";
}
} // namespace framework
} // namespace paddle
......@@ -24,6 +24,7 @@ limitations under the License. */
#include "paddle/fluid/framework/details/sequential_execution_pass.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/graph_to_program_pass.h"
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
namespace paddle {
......@@ -243,3 +244,4 @@ USE_PASS(sequential_execution_pass);
USE_PASS(all_reduce_deps_pass);
USE_PASS(modify_op_lock_and_record_event_pass);
USE_PASS(lock_free_optimize_pass);
USE_PASS(graph_to_program_pass);
......@@ -91,7 +91,7 @@ struct BuildStrategy {
int num_trainers_{1};
int trainer_id_{0};
std::vector<std::string> trainers_endpoints_;
bool remove_unnecessary_lock_{false};
bool remove_unnecessary_lock_{true};
// NOTE:
// Before you add new options, think if it's a general strategy that works
......
......@@ -25,6 +25,9 @@ struct ExecutionStrategy {
size_t num_threads_{0};
bool use_cuda_{true};
bool allow_op_delay_{false};
// If we set this to 1, we will delete all variables when finish a batch. and
// this will loss 15%+ performance.
// Please be aware about this parameters.
size_t num_iteration_per_drop_scope_{1};
ExecutorType type_{kDefault};
bool dry_run_{false};
......
......@@ -27,7 +27,7 @@ limitations under the License. */
#include "paddle/fluid/platform/profiler.h"
#ifdef PADDLE_WITH_NGRAPH
#include "paddle/fluid/framework/ngraph_operator.h"
#include "paddle/fluid/operators/ngraph/ngraph_engine.h"
#endif
DECLARE_bool(benchmark);
......@@ -133,24 +133,6 @@ static void DeleteUnusedTensors(
}
}
static void EnableFusedOp(ExecutorPrepareContext* ctx) {
#ifdef PADDLE_WITH_NGRAPH
VLOG(3) << "use_ngraph=True";
auto intervals = NgraphOperator::NgraphOpIntervals(&ctx->ops_);
for (auto& interval : intervals) {
auto* ng_op = new NgraphOperator(ctx->prog_, ctx->block_id_, interval.at(0),
interval.at(1));
*interval[0] = std::unique_ptr<OperatorBase>(ng_op);
}
for (auto it = intervals.rbegin(); it != intervals.rend(); ++it) {
ctx->ops_.erase(it->at(0) + 1, it->at(1));
}
#else
LOG(WARNING)
<< "'NGRAPH' is not supported, Please re-compile with WITH_NGRAPH option";
#endif
}
Executor::Executor(const platform::Place& place) : place_(place) {}
void Executor::Close() {
......@@ -204,6 +186,9 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
bool create_local_scope, bool create_vars) {
platform::RecordBlock b(block_id);
if (FLAGS_use_mkldnn) EnableMKLDNN(pdesc);
#ifdef PADDLE_WITH_NGRAPH
if (FLAGS_use_ngraph) operators::NgraphEngine::EnableNgraph(pdesc);
#endif
auto ctx = Prepare(pdesc, block_id);
RunPreparedContext(ctx.get(), scope, create_local_scope, create_vars);
}
......@@ -379,7 +364,6 @@ std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
for (auto& op_desc : block.AllOps()) {
ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
}
if (FLAGS_use_ngraph) EnableFusedOp(ctx.get());
return ctx;
}
......
......@@ -15,6 +15,7 @@
#include "paddle/fluid/framework/ir/fc_fuse_pass.h"
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
......
......@@ -18,8 +18,10 @@ limitations under the License. */
#include <fstream>
#include <iosfwd>
#include <ostream>
#include <stack>
#include <unordered_map>
#include <unordered_set>
#include "paddle/fluid/framework/ir/graph_traits.h"
DEFINE_string(print_sub_graph_dir, "",
"FLAGS_print_sub_graph_dir is used "
......@@ -41,7 +43,7 @@ void SortHelper(
}
}
VLOG(3) << "topology sort insert: " << node->Name()
VLOG(5) << "topology sort insert: " << node->Name() << " "
<< reinterpret_cast<void *>(node) << " input " << node->inputs.size();
ret->push_back(node);
}
......@@ -99,12 +101,13 @@ std::vector<ir::Node *> TopologySortOperations(const Graph &graph) {
return ret;
}
// Build operator inlink edge table.
std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
const Graph &graph) {
std::map<ir::Node *, std::unordered_set<ir::Node *>> adj_list;
for (auto &n : graph.Nodes()) {
if (n->NodeType() != ir::Node::Type::kOperation) continue;
if (!n->IsOp()) continue;
if (adj_list.find(n) == adj_list.end()) {
adj_list[n] = std::unordered_set<ir::Node *>();
}
......@@ -121,6 +124,119 @@ std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
return adj_list;
}
// Build operator outlink edge table.
std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationOutAdjList(
const Graph &graph) {
std::map<ir::Node *, std::unordered_set<ir::Node *>> adj_list;
for (auto &n : graph.Nodes()) {
if (!n->IsOp()) continue;
if (adj_list.find(n) == adj_list.end()) {
adj_list[n] = std::unordered_set<ir::Node *>();
}
for (auto &var : n->outputs) {
for (auto &adj_n : var->outputs) {
PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation);
VLOG(40) << "adj " << adj_n->Name() << reinterpret_cast<void *>(adj_n)
<< " -> " << n->Name() << reinterpret_cast<void *>(n)
<< " via " << var->Name() << reinterpret_cast<void *>(var);
adj_list[n].insert(adj_n);
}
}
}
return adj_list;
}
std::vector<ir::Node *> OpDFSSort(const Graph &graph) {
auto edge_table = BuildOperationOutAdjList(graph);
std::stack<Node *> stack;
for (auto &ele : edge_table) {
if (ele.first->inputs.empty()) {
// find the input ops (those without input vars)
stack.push(ele.first);
} else {
// find the ops with only persistable vars as inputs.
bool all_persistable = true;
for (auto *input : ele.first->inputs) {
if (!(input->IsVar() && input->Var() && input->Var()->Persistable())) {
all_persistable = false;
}
}
if (all_persistable) {
stack.push(ele.first);
}
}
}
std::vector<Node *> res;
// start from the feed op and DFS
std::unordered_set<Node *> unique_set;
while (!stack.empty()) {
// will start from the last feed by default.
auto cur = stack.top();
stack.pop();
unique_set.insert(cur);
res.push_back(cur);
for (auto *op : edge_table[cur]) {
if (!unique_set.count(op)) {
stack.push(op);
}
}
}
return res;
}
std::vector<ir::Node *> TopologyDfsSortOperations(const Graph &graph) {
std::vector<ir::Node *> nodes;
std::unordered_map<Node *, int> in_degree;
auto set_out_ops_ready = [&](Node *var) {
for (auto *op : var->outputs) {
--in_degree[op];
}
};
// build in_degree
for (auto *node : graph.Nodes()) {
if (node->IsOp()) {
in_degree[node] += node->inputs.size();
} else if (node->IsVar() && node->inputs.empty()) {
// put all the inputs of the whole graph ready.
set_out_ops_ready(node);
}
}
std::deque<Node *> op_queue;
// first visit
for (auto &node : OpDFSSort(graph)) {
if (node->IsOp()) {
op_queue.push_back(node);
}
}
// traverse the graph
int num_ops = op_queue.size();
while (num_ops) {
for (auto it = op_queue.begin(); it != op_queue.end(); it++) {
auto *&cur_op = *it;
if (!cur_op || in_degree[cur_op] > 0) continue;
// visit this node
// put all the output var of this op valid.
for (auto *out_var : cur_op->outputs) {
if (!out_var) continue;
set_out_ops_ready(out_var);
}
VLOG(8) << "visit " << cur_op->Name();
nodes.push_back(cur_op);
cur_op = nullptr;
num_ops--;
}
}
return nodes;
}
size_t GraphNum(const Graph &graph) {
std::unordered_set<ir::Node *> nodes(graph.Nodes());
std::unordered_set<ir::Node *> visited_nodes;
......@@ -203,6 +319,29 @@ size_t GraphNum(const Graph &graph) {
return graph_count;
}
void CleanIndividualNodes(Graph *graph) {
std::unordered_set<Node *> nodes2rm;
for (auto *node : graph->Nodes()) {
if (node->inputs.empty() && node->outputs.empty()) {
nodes2rm.insert(node);
}
}
for (auto *node : nodes2rm) {
graph->RemoveNode(node);
}
}
std::vector<Node *> TopologyVarientSort(const Graph &graph,
SortKind sort_kind) {
switch (sort_kind) {
case SortKind::TS:
return framework::ir::TopologySortOperations(graph);
default:
return framework::ir::TopologyDfsSortOperations(graph);
}
}
} // namespace ir
} // namespace framework
} // namespace paddle
......@@ -34,6 +34,23 @@ size_t GraphNum(const Graph &graph);
// `graph` cannot contain circle.
std::vector<ir::Node *> TopologySortOperations(const Graph &graph);
// Topological sort, but try to DFS.
std::vector<ir::Node *> TopologyDfsSortOperations(const Graph &graph);
// Different kinds to sort the operators in a graph to a sequence.
enum class SortKind {
// Topological Search
TS = 0,
// Topological and Depth First Search
TDFS
};
// Several kinds of topological sort.
std::vector<Node *> TopologyVarientSort(const Graph &graph, SortKind sort_kind);
// Clean the nodes that doesn't connect to others.
void CleanIndividualNodes(Graph *graph);
// Build an adjacency list of operations for the `graph`.
std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
const Graph &graph);
......
......@@ -20,7 +20,6 @@ limitations under the License. */
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/program_desc.h"
namespace paddle {
......@@ -29,6 +28,14 @@ namespace ir {
std::unique_ptr<Graph> GraphToProgramPass::ApplyImpl(
std::unique_ptr<Graph> graph) const {
// Remove the unneeded variables after memory optimization.
std::unordered_set<std::string> vars2remove;
if (graph->Has(kGraphToProgramVarsToRemove)) {
vars2remove = graph->Get<std::unordered_set<std::string>>(
kGraphToProgramVarsToRemove);
VLOG(2) << "graph to program remove " << vars2remove.size() << " nodes";
}
ProgramDesc& program = Get<ProgramDesc>("program");
std::unique_ptr<proto::ProgramDesc> program_pb(
......@@ -40,25 +47,35 @@ std::unique_ptr<Graph> GraphToProgramPass::ApplyImpl(
std::unordered_set<std::string> visited_vars;
for (ir::Node* n : graph->Nodes()) {
if (n->IsVar()) {
if (n->Var() && visited_vars.count(n->Var()->Name()) == 0) {
if (n->Var() && visited_vars.count(n->Var()->Name()) == 0 &&
!vars2remove.count(n->Var()->Name())) {
visited_vars.insert(n->Var()->Name());
block->add_vars()->MergeFrom(*n->Var()->Proto());
}
}
}
block->clear_ops();
std::vector<ir::Node*> nodes = TopologySortOperations(*graph);
std::vector<ir::Node*> nodes;
if (Has(kGraphToProgramSortKind)) {
// Inference Memory Optimize relays on this branch.
int sort_kind = Get<int>(kGraphToProgramSortKind);
nodes = TopologyVarientSort(
*graph, static_cast<framework::ir::SortKind>(sort_kind));
} else {
nodes = TopologySortOperations(*graph);
}
for (ir::Node* n : nodes) {
if (!n->Op()) {
continue;
}
if (!n->Op()) continue;
block->add_ops()->MergeFrom(*n->Op()->Proto());
}
program.CopyFrom(*program_pb);
return graph;
}
} // namespace ir
} // namespace framework
} // namespace paddle
......
......@@ -20,6 +20,10 @@ namespace paddle {
namespace framework {
namespace ir {
const char kGraphToProgramVarsToRemove[] =
"__graph_to_program_vars_to_remove__";
const char kGraphToProgramSortKind[] = "__graph_to_program_sort_kind__";
class GraphToProgramPass : public Pass {
protected:
std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const override;
......
......@@ -14,6 +14,7 @@
#include "paddle/fluid/framework/ir/graph_traits.h"
#include <set>
#include <vector>
namespace paddle {
......@@ -79,7 +80,7 @@ NodesTSIterator::NodesTSIterator(const std::vector<Node *> &source) {
}
std::unordered_set<Node *> visited;
std::unordered_set<Node *> to_visit{source.begin(), source.end()};
std::set<Node *> to_visit{source.begin(), source.end()};
std::vector<Node *> inlink_visited;
while (!to_visit.empty()) {
......
......@@ -135,4 +135,4 @@ GraphVizPass::marked_nodes_t GraphVizPass::ConsumeMarkedNodes(
} // namespace paddle
REGISTER_PASS(graph_viz_pass, paddle::framework::ir::GraphVizPass)
.RequirePassAttr(paddle::framework::ir::kGraphVizPath);
.RequirePassAttr(paddle::framework::ir::kGraphVizPath);
\ No newline at end of file
......@@ -64,7 +64,7 @@ class Node {
std::string Name() const { return name_; }
VarDesc* Var() {
VarDesc* Var() const {
PADDLE_ENFORCE(IsVar());
return var_desc_.get();
}
......
......@@ -28,10 +28,14 @@ std::unique_ptr<Graph> Pass::Apply(std::unique_ptr<Graph> graph) const {
PADDLE_ENFORCE(graph->Has(attr), "Required graph atrribute %s not set.",
attr);
}
auto* native_graph = graph.get();
auto applied_graph = ApplyImpl(std::move(graph));
// TODO(panyx0718): Add more verifications.
PADDLE_ENFORCE(!HasCircle(*applied_graph),
"Illegal Pass. Generated graph shouldn't has cycle.");
PADDLE_ENFORCE(applied_graph.get() == native_graph,
"Pass::Apply() cannot delete the passed graph and shouldn't "
"return a new graph.(For the need of pybind11)");
applied_ = true;
return applied_graph;
}
......
......@@ -54,13 +54,14 @@ std::ostream &operator<<(std::ostream &os, const LoD &lod) {
std::ostream &operator<<(std::ostream &os, const LoDTensor &t) {
if (!platform::is_cpu_place(t.place())) {
LoDTensor tt;
framework::TensorCopy(t, platform::CPUPlace(), &tt);
LoDTensor cpu_tensor;
cpu_tensor.set_lod(t.lod());
framework::TensorCopy(t, platform::CPUPlace(), &cpu_tensor);
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(t.place());
dev_ctx.Wait();
os << tt;
os << cpu_tensor;
return os;
}
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
......
......@@ -50,8 +50,8 @@ void NaiveExecutor::Run() {
"running Paddle Inference";
#endif // PADDLE_ON_INFERENCE
for (auto &op : ops_) {
VLOG(3) << std::this_thread::get_id() << " run " << op->Type()
<< " on scope " << scope_;
VLOG(4) << std::this_thread::get_id() << " run "
<< op->DebugStringEx(scope_) << " on scope " << scope_;
op->SetIsCalledByExecutor(false);
op->Run(*scope_, place_);
}
......@@ -69,10 +69,12 @@ void NaiveExecutor::CreateVariables(const ProgramDesc &desc, int block_id,
anc = anc->parent();
}
int num_vars = 0;
for (auto &var : global_block.AllVars()) {
if (var->Name() == framework::kEmptyVarName) {
continue;
}
num_vars++;
if (persistable == var->Persistable()) {
if (persistable) {
......@@ -90,6 +92,7 @@ void NaiveExecutor::CreateVariables(const ProgramDesc &desc, int block_id,
}
}
}
VLOG(4) << "naive executor create " << num_vars << " vars";
}
void NaiveExecutor::CreateOps(const ProgramDesc &desc, int block_id,
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/attribute.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_kernel_type.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/variant.h"
#include "ngraph/type/element_type.hpp"
namespace paddle {
namespace framework {
class NgraphOperator : public OperatorBase {
public:
static std::vector<
std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>>
NgraphOpIntervals(
std::vector<std::unique_ptr<paddle::framework::OperatorBase>>* ops);
explicit NgraphOperator(
const ProgramDesc& prog, size_t block_id,
std::vector<std::unique_ptr<OperatorBase>>::iterator start,
std::vector<std::unique_ptr<OperatorBase>>::iterator end,
const std::string& type = "fused_op", const VariableNameMap& inputs = {},
const VariableNameMap& outputs = {}, const AttributeMap& attrs = {});
void RunImpl(const Scope& scope, const platform::Place& place) const final;
private:
const ProgramDesc pdesc_;
size_t block_;
std::vector<std::shared_ptr<OperatorBase>> fused_ops_;
std::unordered_map<std::string, ngraph::element::Type> var_type_map_;
std::unordered_set<std::string> persistables_;
std::unordered_set<std::string> fetches_;
std::unordered_set<std::string> post_op_inputs_;
bool is_full_ = false;
void Process();
};
} // namespace framework
} // namespace paddle
......@@ -1073,7 +1073,9 @@ Scope* OperatorWithKernel::PrepareData(
proto::VarType::Type OperatorWithKernel::IndicateDataType(
const ExecutionContext& ctx) const {
int data_type = -1;
proto::VarType::Type dafault_data_type =
static_cast<proto::VarType::Type>(-1);
proto::VarType::Type data_type = dafault_data_type;
for (auto& input : this->inputs_) {
const std::vector<const Variable*> vars = ctx.MultiInputVar(input.first);
for (size_t i = 0; i < vars.size(); ++i) {
......@@ -1090,18 +1092,19 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
if (t != nullptr) {
PADDLE_ENFORCE(t->IsInitialized(), "Input %s(%lu)is not initialized",
input.first, i);
int tmp = static_cast<int>(t->type());
proto::VarType::Type tmp = t->type();
PADDLE_ENFORCE(
tmp == data_type || data_type == -1,
tmp == data_type || data_type == dafault_data_type,
"DataType of Paddle Op %s must be the same. Get (%d) != (%d)",
Type(), data_type, tmp);
Type(), DataTypeToString(data_type), DataTypeToString(tmp));
data_type = tmp;
}
}
}
}
PADDLE_ENFORCE(data_type != -1, "DataType should be indicated by input");
return static_cast<proto::VarType::Type>(data_type);
PADDLE_ENFORCE(data_type != dafault_data_type,
"DataType should be indicated by input");
return data_type;
}
OpKernelType OperatorWithKernel::GetExpectedKernelType(
......
......@@ -25,7 +25,8 @@ inline const T* Tensor::data() const {
check_memory_size();
bool valid =
std::is_same<T, void>::value || type_ == DataTypeTrait<T>::DataType;
PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %d", type_);
PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %d",
DataTypeToString(type_));
return reinterpret_cast<const T*>(
reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
......
......@@ -206,59 +206,68 @@ framework::LoDTensor& VarBase::GradValue() {
}
std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
if (!grad_op_desc_ && backward_id_ <= 0) {
if (grad_op_descs_.empty() && backward_id_ <= 0) {
LOG(WARNING) << "op with no grad: " << op_desc_->Type();
return {};
}
std::map<std::string, std::vector<framework::Variable*>> grad_outputs;
std::vector<framework::VariableValueMap> grad_outputs;
if (backward_id_ > 0) {
VLOG(3) << "py_layer_grad";
grad_outputs[framework::GradVarName(PyLayer::kFwdOut)] = PyLayer::ApplyGrad(
backward_id_,
grad_input_vars_[framework::GradVarName(PyLayer::kFwdInp)]);
grad_outputs.resize(1);
grad_outputs[0][framework::GradVarName(PyLayer::kFwdOut)] =
PyLayer::ApplyGrad(
backward_id_,
grad_input_vars_[0][framework::GradVarName(PyLayer::kFwdInp)]);
} else {
VLOG(3) << "op grad " << grad_op_desc_->Type();
for (auto it : grad_output_vars_) {
auto& outputs = grad_outputs[it.first];
for (size_t i = 0; i < it.second.size(); ++i) {
// Allocate a new variable
Variable* tmp_var = new framework::Variable();
tmp_var->GetMutable<framework::LoDTensor>();
outputs.push_back(tmp_var);
grad_outputs.resize(grad_op_descs_.size());
for (size_t k = 0; k < grad_op_descs_.size(); ++k) {
framework::OpDesc* grad_op_desc = grad_op_descs_[k];
VLOG(3) << "op grad " << grad_op_desc->Type();
for (auto it : grad_output_vars_[k]) {
auto& outputs = grad_outputs[k][it.first];
for (size_t i = 0; i < it.second.size(); ++i) {
// Allocate a new variable
Variable* tmp_var = new framework::Variable();
tmp_var->GetMutable<framework::LoDTensor>();
outputs.push_back(tmp_var);
}
}
}
framework::RuntimeContext ctx(grad_input_vars_, grad_outputs);
framework::RuntimeContext ctx(grad_input_vars_[k], grad_outputs[k]);
// No need to do compile time infer shape here.
// grad_op_desc_->InferShape(*block_);
grad_op_desc_->InferVarType(block_);
// No need to do compile time infer shape here.
// grad_op_desc_->InferShape(*block_);
grad_op_desc->InferVarType(block_);
std::unique_ptr<framework::OperatorBase> opbase =
framework::OpRegistry::CreateOp(*grad_op_desc_);
framework::OperatorWithKernel* op_kernel =
dynamic_cast<framework::OperatorWithKernel*>(opbase.get());
PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel");
std::unique_ptr<framework::OperatorBase> opbase =
framework::OpRegistry::CreateOp(*grad_op_desc);
framework::OperatorWithKernel* op_kernel =
dynamic_cast<framework::OperatorWithKernel*>(opbase.get());
PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel");
framework::Scope scope;
PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place_);
p.op.RuntimeInferShape(scope, place_, ctx);
p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx));
framework::Scope scope;
PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place_);
p.op.RuntimeInferShape(scope, place_, ctx);
p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx));
}
}
for (auto it : grad_output_vars_) {
auto& outputs = grad_outputs[it.first];
auto& origin_outputs = it.second;
PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size());
for (size_t i = 0; i < outputs.size(); ++i) {
framework::Variable* grad = outputs[i];
framework::Variable* orig_grad = origin_outputs[i];
AddTo(grad, orig_grad, place_);
delete grad;
for (size_t k = 0; k < grad_output_vars_.size(); ++k) {
for (auto it : grad_output_vars_[k]) {
auto& outputs = grad_outputs[k][it.first];
auto& origin_outputs = it.second;
PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size());
for (size_t i = 0; i < outputs.size(); ++i) {
framework::Variable* grad = outputs[i];
framework::Variable* orig_grad = origin_outputs[i];
AddTo(grad, orig_grad, place_);
delete grad;
}
}
}
return input_vars_;
}
......
......@@ -189,12 +189,13 @@ class OpBase {
OpBase()
: op_desc_(nullptr),
forward_id_(-1),
grad_op_desc_(nullptr),
backward_id_(-1),
place_(platform::CPUPlace()) {}
virtual ~OpBase() {
if (grad_op_desc_) delete grad_op_desc_;
for (framework::OpDesc* desc : grad_op_descs_) {
delete desc;
}
}
std::map<std::string, std::vector<VarBase*>> ApplyGrad();
......@@ -203,9 +204,11 @@ class OpBase {
// For pure python PyLayer, use `forward_id_`, otherwise, use op_desc_.
framework::OpDesc* op_desc_;
int forward_id_;
// When has backward, one of `grad_op_desc_` or `backward_id_` is set,
// When has backward, one of `grad_op_descs_` or `backward_id_` is set,
// not both.
framework::OpDesc* grad_op_desc_;
// Note: each fwd op corresponds to a vector of bwd ops.
std::vector<framework::OpDesc*> grad_op_descs_;
int backward_id_;
platform::Place place_;
......@@ -215,8 +218,11 @@ class OpBase {
OpBasePtrMap pre_ops_;
std::map<std::string, std::vector<int>> pre_ops_out_idx_;
framework::VariableValueMap grad_input_vars_;
framework::VariableValueMap grad_output_vars_;
// Inputs to a vector of bwd ops.
std::vector<framework::VariableValueMap> grad_input_vars_;
// Outputs to a vector of bwd ops.
std::vector<framework::VariableValueMap> grad_output_vars_;
framework::BlockDesc* block_;
};
......
......@@ -24,15 +24,16 @@ namespace imperative {
void CreateGradOp(const framework::OpDesc& op_desc,
const std::unordered_set<std::string>& no_grad_set,
const std::vector<framework::BlockDesc*>& grad_sub_block,
framework::OpDesc** grad_op_desc,
std::vector<framework::OpDesc*>* grad_op_descs,
std::unordered_map<std::string, std::string>* grad_to_var) {
std::vector<std::unique_ptr<framework::OpDesc>> grad_op_descs =
PADDLE_ENFORCE(grad_op_descs->empty());
std::vector<std::unique_ptr<framework::OpDesc>> descs =
framework::OpInfoMap::Instance()
.Get(op_desc.Type())
.GradOpMaker()(op_desc, no_grad_set, grad_to_var, grad_sub_block);
PADDLE_ENFORCE(grad_op_descs.size() == 1, "Only support 1 grad op now.");
// TODO(panyx0718): Leak?
*grad_op_desc = grad_op_descs[0].release();
for (auto& desc : descs) {
grad_op_descs->emplace_back(desc.release());
}
}
void InitVar(framework::Variable* var, framework::Variable* grad_var,
......@@ -140,49 +141,52 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
prepared_op.op, scope, *prepared_op.dev_ctx, prepared_op.ctx));
if (!stop_gradient) {
framework::OpDesc* grad_op_desc;
// TODO(panyx): Is this leaked?
std::unique_ptr<std::unordered_map<std::string, std::string>> grad_to_var(
new std::unordered_map<std::string, std::string>());
CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var.get());
op->grad_op_desc_ = grad_op_desc;
for (auto it : grad_op_desc->Inputs()) {
auto& grad_in_vars = op->grad_input_vars_[it.first];
for (const std::string& grad_invar : it.second) {
block->FindRecursiveOrCreateVar(grad_invar);
auto var_it = grad_to_var->find(grad_invar);
if (var_it == grad_to_var->end()) {
auto fwd_var_it = vars.find(grad_invar);
PADDLE_ENFORCE(fwd_var_it != vars.end());
// Forward inputs or outputs.
grad_in_vars.push_back(fwd_var_it->second->var_);
} else {
CreateGradOp(*op_desc, {}, {block}, &op->grad_op_descs_, grad_to_var.get());
op->grad_input_vars_.resize(op->grad_op_descs_.size());
op->grad_output_vars_.resize(op->grad_op_descs_.size());
for (size_t i = 0; i < op->grad_op_descs_.size(); ++i) {
framework::OpDesc* grad_op_desc = op->grad_op_descs_[i];
for (auto it : grad_op_desc->Inputs()) {
auto& grad_in_vars = op->grad_input_vars_[i][it.first];
for (const std::string& grad_invar : it.second) {
block->FindRecursiveOrCreateVar(grad_invar);
auto var_it = grad_to_var->find(grad_invar);
if (var_it == grad_to_var->end()) {
auto fwd_var_it = vars.find(grad_invar);
PADDLE_ENFORCE(fwd_var_it != vars.end());
// Forward inputs or outputs.
grad_in_vars.push_back(fwd_var_it->second->var_);
} else {
VarBase* var = vars[var_it->second];
if (!var->grads_->var_->IsInitialized()) {
InitVar(var->var_, var->grads_->var_,
prepared_op.GetDeviceContext());
}
// Douts.
grad_in_vars.push_back(var->grads_->var_);
}
}
}
for (auto it : grad_op_desc->Outputs()) {
auto& grad_out_vars = op->grad_output_vars_[i][it.first];
for (const std::string& grad_outvar : it.second) {
block->FindRecursiveOrCreateVar(grad_outvar);
auto var_it = grad_to_var->find(grad_outvar);
PADDLE_ENFORCE(var_it != grad_to_var->end(),
"Could not found the grad op output var, should this "
"operator %s's stop gradient be True",
op_desc->Type());
VarBase* var = vars[var_it->second];
if (!var->grads_->var_->IsInitialized()) {
InitVar(var->var_, var->grads_->var_,
prepared_op.GetDeviceContext());
}
// Douts.
grad_in_vars.push_back(var->grads_->var_);
}
}
}
for (auto it : grad_op_desc->Outputs()) {
auto& grad_out_vars = op->grad_output_vars_[it.first];
for (const std::string& grad_outvar : it.second) {
block->FindRecursiveOrCreateVar(grad_outvar);
auto var_it = grad_to_var->find(grad_outvar);
PADDLE_ENFORCE(var_it != grad_to_var->end(),
"Could not found the grad op output var, should this "
"operator %s's stop gradient be True",
op_desc->Type());
VarBase* var = vars[var_it->second];
if (!var->grads_->var_->IsInitialized()) {
InitVar(var->var_, var->grads_->var_, prepared_op.GetDeviceContext());
grad_out_vars.push_back(var->grads_->var_);
}
grad_out_vars.push_back(var->grads_->var_);
}
}
}
......@@ -211,10 +215,12 @@ std::vector<VarBase*> Tracer::PyTrace(OpBase* op,
out->TrackPreOp(op, PyLayer::kFwdOut, i, stop_gradient);
}
if (!stop_gradient) {
op->grad_input_vars_.resize(1);
op->grad_output_vars_.resize(1);
auto& grad_input_vars =
op->grad_input_vars_[framework::GradVarName(PyLayer::kFwdInp)];
op->grad_input_vars_[0][framework::GradVarName(PyLayer::kFwdInp)];
auto& grad_output_vars =
op->grad_output_vars_[framework::GradVarName(PyLayer::kFwdOut)];
op->grad_output_vars_[0][framework::GradVarName(PyLayer::kFwdOut)];
for (const VarBase* inp : inputs) {
grad_input_vars.push_back(inp->var_);
......
......@@ -18,6 +18,7 @@ cc_library(analysis SRCS
analyzer.cc
analysis_pass
DEPS ${analysis_deps} analysis_helper
${INFER_IR_PASSES}
)
cc_test(test_dot SRCS dot_tester.cc DEPS analysis)
......
......@@ -15,8 +15,8 @@
#include "paddle/fluid/inference/analysis/analyzer.h"
#include <string>
#include <vector>
#include "paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h"
#include "paddle/fluid/inference/analysis/passes/passes.h"
#include "paddle/fluid/string/pretty_log.h"
namespace paddle {
namespace inference {
......@@ -24,13 +24,16 @@ namespace analysis {
Analyzer::Analyzer() {}
void Analyzer::Run(Argument *argument) { RunIrAnalysis(argument); }
void Analyzer::Run(Argument *argument) { RunAnalysis(argument); }
void Analyzer::RunIrAnalysis(Argument *argument) {
std::vector<std::string> passes({"ir_analysis_compose_pass"});
for (auto &pass : passes) {
PassRegistry::Global().Retreive(pass)->Run(argument);
void Analyzer::RunAnalysis(Argument *argument) {
PADDLE_ENFORCE(argument->analysis_passes_valid(),
"analsis_passes is not valid in the argument.");
for (auto &pass : argument->analysis_passes()) {
string::PrettyLogH1("--- Running analysis [%s]", pass);
auto *ptr = PassRegistry::Global().Retreive(pass);
PADDLE_ENFORCE_NOT_NULL(ptr, "no analysis pass called %s", pass);
ptr->Run(argument);
}
}
......
......@@ -54,7 +54,7 @@ class Analyzer final {
DISABLE_COPY_AND_ASSIGN(Analyzer);
protected:
void RunIrAnalysis(Argument* argument);
void RunAnalysis(Argument* argument);
};
} // namespace analysis
......
......@@ -32,6 +32,8 @@ TEST(Analyzer, analysis_without_tensorrt) {
argument.SetModelDir(FLAGS_inference_model_dir);
argument.SetIrAnalysisPasses({"infer_clean_graph_pass"});
argument.SetUseGPU(false);
argument.SetAnalysisPasses({"ir_graph_build_pass", "ir_analysis_pass",
"ir_params_sync_among_devices_pass"});
Analyzer analyser;
analyser.Run(&argument);
......@@ -44,6 +46,8 @@ TEST(Analyzer, analysis_with_tensorrt) {
argument.SetModelDir(FLAGS_inference_model_dir);
argument.SetIrAnalysisPasses({"infer_clean_graph_pass"});
argument.SetUseGPU(false);
argument.SetAnalysisPasses({"ir_graph_build_pass", "ir_analysis_pass",
"ir_params_sync_among_devices_pass"});
Analyzer analyser;
analyser.Run(&argument);
......
......@@ -28,6 +28,7 @@
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/api/paddle_analysis_config.h"
#include "paddle/fluid/platform/variant.h"
namespace paddle {
......@@ -110,22 +111,37 @@ struct Argument {
// The overall Scope to work on.
DECL_ARGUMENT_UNIQUE_FIELD(scope, Scope, framework::Scope);
// The default program, loaded from disk.
DECL_ARGUMENT_UNIQUE_FIELD(main_program, MainProgram, framework::ProgramDesc);
// The ir passes to perform in analysis phase.
DECL_ARGUMENT_FIELD(ir_analysis_passes, IrAnalysisPasses,
std::vector<std::string>);
DECL_ARGUMENT_FIELD(analysis_passes, AnalysisPasses,
std::vector<std::string>);
// Pass a set of op types to enable its mkldnn kernel
DECL_ARGUMENT_FIELD(mkldnn_enabled_op_types, MKLDNNEnabledOpTypes,
std::unordered_set<std::string>);
// Passed from config.
DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool);
DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int);
DECL_ARGUMENT_FIELD(use_tensorrt, UseTensorRT, bool);
DECL_ARGUMENT_FIELD(tensorrt_max_batch_size, TensorRtMaxBatchSize, int);
DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int);
DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int);
DECL_ARGUMENT_FIELD(tensorrt_precision_mode, TensorRtPrecisionMode,
contrib::AnalysisConfig::Precision);
// Memory optimized related.
DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool);
DECL_ARGUMENT_FIELD(static_memory_optim, StaticMemoryOptim, bool);
DECL_ARGUMENT_FIELD(static_memory_optim_force_update,
StaticMemoryOptimForceUpdate, bool);
// Indicate which kind of sort algorithm is used for operators, the memory
// optimization relays on the sort algorithm.
DECL_ARGUMENT_FIELD(memory_optim_sort_kind, MemoryOptimSortKind, int);
// The program transformed by IR analysis phase.
DECL_ARGUMENT_UNIQUE_FIELD(ir_analyzed_program, IrAnalyzedProgram,
......
......@@ -36,6 +36,14 @@ void SetAttr<int>(framework::proto::OpDesc *op, const std::string &name,
attr->set_i(data);
}
template <>
void SetAttr<bool>(framework::proto::OpDesc *op, const std::string &name,
const bool &data) {
auto *attr = op->add_attrs();
attr->set_name(name);
attr->set_type(paddle::framework::proto::AttrType::BOOLEAN);
attr->set_b(data);
}
template <>
void SetAttr<int64_t>(framework::proto::OpDesc *op, const std::string &name,
const int64_t &data) {
auto *attr = op->add_attrs();
......
......@@ -17,6 +17,7 @@ limitations under the License. */
#include <sys/stat.h>
#include <cstdio>
#include <fstream>
#include <set>
#include <string>
#include <typeindex>
#include <unordered_map>
......@@ -28,6 +29,18 @@ limitations under the License. */
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/port.h"
#ifdef _WIN32
#include <direct.h>
#include <io.h>
#define GCC_ATTRIBUTE(attr__) ;
#define MKDIR(path) _mkdir(path)
#else
#include <unistd.h>
#define GCC_ATTRIBUTE(attr__) __attribute__((attr__));
#define MKDIR(path) mkdir(path, S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH)
#endif
#define __SHOULD_USE_RESULT__ GCC_ATTRIBUTE(warn_unused_result)
namespace paddle {
namespace inference {
namespace analysis {
......@@ -156,6 +169,54 @@ static bool PathExists(const std::string &path) {
return false;
}
static std::string GetDirRoot(const std::string &path) {
char sep = '/';
#ifdef _WIN32
sep = '\\';
#endif
size_t i = path.rfind(sep, path.length());
if (i != std::string::npos) {
return (path.substr(0, i));
}
return path;
}
static std::string GetOrCreateModelOptCacheDir(const std::string &model_root) {
std::string opt_cache_dir = model_root + "/_opt_cache/";
if (!PathExists(opt_cache_dir)) {
PADDLE_ENFORCE(MKDIR(opt_cache_dir.c_str()) != -1,
"Can not create optimize cache directory: %s, Make sure you "
"have permission to write",
opt_cache_dir);
}
return opt_cache_dir;
}
static std::string GetTrtCalibPath(const std::string &model_root,
const std::string &engine_key) {
return model_root + "/trt_calib_" + engine_key;
}
// If there is no calib table data file in model_opt_cache_dir, return "".
static std::string GetTrtCalibTableData(const std::string &model_opt_cache_dir,
const std::string &engine_key,
bool enable_int8) {
std::string trt_calib_table_path =
GetTrtCalibPath(model_opt_cache_dir, engine_key);
if (enable_int8 && FileExists(trt_calib_table_path)) {
VLOG(3) << "Calibration table file: " << trt_calib_table_path
<< "is found here";
std::ifstream infile(trt_calib_table_path, std::ios::in);
std::stringstream buffer;
buffer << infile.rdbuf();
std::string calibration_data(buffer.str());
return calibration_data;
}
return "";
}
} // namespace analysis
} // namespace inference
} // namespace paddle
......
......@@ -67,6 +67,20 @@ void IRPassManager::CreatePasses(Argument *argument,
pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size()));
pass->Set("min_subgraph_size",
new int(argument->tensorrt_min_subgraph_size()));
pass->Set("program",
new framework::ProgramDesc *(&argument->main_program()));
bool enable_int8 = argument->tensorrt_precision_mode() ==
contrib::AnalysisConfig::Precision::kInt8;
pass->Set("enable_int8", new bool(enable_int8));
std::string model_opt_cache_dir =
argument->Has("model_dir")
? argument->model_dir()
: GetDirRoot(argument->model_program_path());
pass->Set(
"model_opt_cache_dir",
new std::string(GetOrCreateModelOptCacheDir(model_opt_cache_dir)));
}
// graph_ = pass->Apply(std::move(graph_));
......@@ -83,6 +97,7 @@ std::unique_ptr<Graph> IRPassManager::Apply(std::unique_ptr<Graph> graph) {
PADDLE_ENFORCE(graph.get());
// Apply all the passes
for (const auto &pass : passes_) {
if (pass->Type() == "graph_viz_pass") continue;
PrettyLogEndl(Style::H2(), "--- Running IR pass [%s]", pass->Type());
graph = pass->Apply(std::move(graph));
}
......@@ -90,11 +105,14 @@ std::unique_ptr<Graph> IRPassManager::Apply(std::unique_ptr<Graph> graph) {
}
framework::proto::ProgramDesc IRPassManager::AcquireProgram(
std::unique_ptr<Graph> *graph, const ProgramDesc &program) const {
std::unique_ptr<Graph> *graph, ProgramDesc *program) const {
auto pass =
framework::ir::PassRegistry::Instance().Get("graph_to_program_pass");
ProgramDesc desc(program);
// Direct using ProgramDesc desc(argument->main_program()) may cause
// incomplete copies of information.
ProgramDesc desc;
desc.CopyFrom(*program->Proto());
pass->SetNotOwned("program", &desc);
auto *the_graph = graph->release();
*graph = pass->Apply(std::unique_ptr<Graph>(the_graph));
......
......@@ -29,6 +29,7 @@
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/analysis/argument.h"
#include "paddle/fluid/inference/analysis/helper.h"
namespace paddle {
namespace inference {
......@@ -42,8 +43,8 @@ class IRPassManager final {
std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph);
framework::proto::ProgramDesc AcquireProgram(
std::unique_ptr<Graph> *graph, const ProgramDesc &program) const;
framework::proto::ProgramDesc AcquireProgram(std::unique_ptr<Graph> *graph,
ProgramDesc *program) const;
framework::ir::Graph &graph() const { return *graph_; }
......
cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS proto_desc)
if (TENSORRT_FOUND)
if (WITH_GPU AND TENSORRT_FOUND)
cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector tensorrt_op_teller)
set(analysis_deps ${analysis_deps}
......
......@@ -413,7 +413,6 @@ void SubGraphFuser::ReplaceNodesWithSubGraphs() {
auto subgraphs = SubgraphDetector(graph_, node_inside_subgraph_teller_)();
for (auto &subgraph : subgraphs) {
if (subgraph.size() <= (size_t)min_subgraph_size_) continue;
LOG(INFO) << "detect a subgraph size " << subgraph.size();
std::unordered_set<Node *> subgraph_uniq(subgraph.begin(), subgraph.end());
// replace this sub-graph with the first node. Two steps: 1. Create a Block
// Node that contains this subgraph 2. Mark the nodes inside the sub-graph
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#include <algorithm>
#include <set>
#include <string>
#include <vector>
......@@ -21,6 +22,7 @@
#include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h"
#include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h"
#include "paddle/fluid/inference/tensorrt/op_teller.h"
#include "paddle/fluid/string/pretty_log.h"
namespace paddle {
namespace inference {
......@@ -66,25 +68,54 @@ std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
return graph;
}
std::string GenerateEngineKey(const std::set<std::string> &engine_inputs,
const std::set<std::string> &engine_outputs) {
std::string engine_hash_key = "";
for (auto name : engine_inputs) {
engine_hash_key += name;
}
for (auto name : engine_outputs) {
engine_hash_key += name;
}
auto engine_key = std::to_string(std::hash<std::string>()(engine_hash_key));
return engine_key;
}
void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
Graph *graph) const {
auto *op_desc = node->Op();
auto &subgraph = *Agent(node).subgraph();
PADDLE_ENFORCE(!subgraph.empty());
framework::ProgramDesc *program_desc =
Get<framework::ProgramDesc *>("program");
// Add new block for TensorRTEngineOP
const framework::BlockDesc &main_block =
program_desc->Block(framework::kRootBlockIndex);
// const framework::BlockDesc& main_block = program_desc->Block(0);
framework::BlockDesc *new_block = program_desc->AppendBlock(main_block);
// An fake block desc.
framework::proto::BlockDesc block_proto;
framework::BlockDesc block_desc(nullptr, &block_proto);
block_desc.Proto()->set_parent_idx(-1);
block_desc.Proto()->set_idx(0);
string::PrettyLogDetail("--- detect a sub-graph with %d nodes",
subgraph.size());
for (auto *node : subgraph) {
auto *new_block_op = new_block->AppendOp();
auto *op = block_desc.AppendOp();
*new_block_op->Proto() = *node->Op()->Proto();
*op->Proto() = *node->Op()->Proto();
}
// collect inputs
std::unordered_set<std::string> input_names;
std::unordered_set<std::string> input_names_with_id;
// Then, we will use the input_names_with_id and output_names_with_id to
// generate the eigine key.
// So, We use set instead of unordered_set here to ensure that the engine key
// is unique.
std::set<std::string> input_names;
std::set<std::string> input_names_with_id;
for (auto *x : node->inputs) {
input_names.insert(x->Name());
input_names_with_id.insert(x->Name() + std::to_string(x->id()));
......@@ -92,8 +123,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
op_desc->SetInput(
"Xs", std::vector<std::string>(input_names.begin(), input_names.end()));
std::unordered_set<std::string> output_names;
std::unordered_set<std::string> output_names_with_id;
std::set<std::string> output_names;
std::set<std::string> output_names_with_id;
for (auto *x : node->outputs) {
output_names.insert(x->Name());
output_names_with_id.insert(x->Name() + std::to_string(x->id()));
......@@ -178,7 +209,6 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
// to Tensor.
std::vector<std::string> output_mapping;
for (auto name : output_names) {
// LOG(INFO) << name << " " << output_name_map.size();
PADDLE_ENFORCE(output_name_map.count(name) != 0);
output_mapping.push_back(output_name_map[name]);
}
......@@ -189,16 +219,29 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
*vars->Add() = *node->Var()->Proto();
}
}
PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(),
"the block has no var-desc");
PADDLE_ENFORCE(!output_mapping.empty());
// Set attrs
op_desc->SetBlockAttr("sub_block", new_block);
SetAttr(op_desc->Proto(), "subgraph",
block_desc.Proto()->SerializeAsString());
// Set attrs
SetAttr(op_desc->Proto(), "max_batch_size", Get<int>("max_batch_size"));
SetAttr(op_desc->Proto(), "workspace_size", Get<int>("workspace_size"));
SetAttr(op_desc->Proto(), "parameters", ExtractParameters(graph->Nodes()));
SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping);
auto enable_int8 = Get<bool>("enable_int8");
auto engine_key =
GenerateEngineKey(input_names_with_id, output_names_with_id);
std::string calibration_data = GetTrtCalibTableData(
Get<std::string>("model_opt_cache_dir"), engine_key, enable_int8);
SetAttr(op_desc->Proto(), "calibration_data", calibration_data);
SetAttr(op_desc->Proto(), "enable_int8", enable_int8);
SetAttr(op_desc->Proto(), "engine_key", engine_key);
}
std::vector<std::string> ExtractParameters(
......
cc_library(ir_graph_build_pass SRCS ir_graph_build_pass.cc DEPS analysis_pass argument ir_pass_manager)
cc_library(ir_analysis_pass SRCS ir_analysis_pass.cc DEPS analysis_pass argument ir_pass_manager)
cc_library(memory_optim_pass SRCS memory_optimize_pass.cc DEPS analysis_pass zero_copy_tensor)
cc_library(ir_params_sync_among_devices_pass SRCS ir_params_sync_among_devices_pass.cc DEPS analysis_pass argument ir_pass_manager)
cc_library(analysis_passes SRCS passes.cc DEPS ir_graph_build_pass ir_analysis_pass ir_params_sync_among_devices_pass)
cc_library(ir_graph_to_program_pass SRCS ir_graph_to_program_pass.cc DEPS analysis_pass graph_to_program_pass)
cc_library(analysis_passes SRCS passes.cc DEPS
ir_graph_build_pass
ir_analysis_pass
ir_params_sync_among_devices_pass
memory_optim_pass
ir_graph_to_program_pass
)
set(analysis_deps ${analysis_deps}
ir_graph_build_pass
ir_analysis_pass
analysis_passes
subgraph_detector
CACHE INTERNAL "")
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h"
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/inference/analysis/ir_pass_manager.h"
namespace paddle {
......@@ -31,9 +32,18 @@ void IrAnalysisPass::RunImpl(Argument* argument) {
IRPassManager the_ir_manager(argument);
graph = the_ir_manager.Apply(std::move(graph));
PADDLE_ENFORCE_GT(graph->Nodes().size(), 0);
argument->SetIrAnalyzedProgram(new framework::proto::ProgramDesc(
the_ir_manager.AcquireProgram(&graph, argument->main_program())));
argument->SetMainGraph(graph.release());
CollectFusionStatis(argument);
}
void IrAnalysisPass::CollectFusionStatis(Argument* argument) {
if (!argument->main_graph().Has(framework::ir::kFuseStatisAttr)) {
LOG(INFO) << "argument has no fuse statis";
return;
}
argument->SetFusionStatis(
argument->main_graph().Get<Argument::fusion_statis_t>(
framework::ir::kFuseStatisAttr));
}
std::string IrAnalysisPass::repr() const { return "ir-analysis-pass"; }
......
......@@ -29,6 +29,9 @@ namespace analysis {
class IrAnalysisPass : public AnalysisPass {
public:
void RunImpl(Argument* argument) override;
void CollectFusionStatis(Argument* argument);
std::string repr() const override;
};
......
......@@ -12,49 +12,36 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h"
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h"
#include "paddle/fluid/framework/ir/graph_to_program_pass.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/inference/analysis/ir_pass_manager.h"
#include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h"
#include "paddle/fluid/string/pretty_log.h"
#include "paddle/fluid/framework/program_desc.h"
namespace paddle {
namespace inference {
namespace analysis {
void IrAnalysisComposePass::RunImpl(Argument *argument) {
ARGUMENT_CHECK_FIELD(argument, ir_analysis_passes);
ApplyIrPasses(argument);
CollectFusionStatis(argument);
}
std::string IrAnalysisComposePass::repr() const {
return "ir-analysis-compose-pass";
}
void IrGraphToProgramPass::RunImpl(Argument *argument) {
auto pass =
framework::ir::PassRegistry::Instance().Get("graph_to_program_pass");
void IrAnalysisComposePass::ApplyIrPasses(Argument *argument) {
std::vector<std::string> passes({
"ir_graph_build_pass", "ir_analysis_pass",
"ir_params_sync_among_devices_pass",
});
for (const auto &pass : passes) {
VLOG(2) << "Run pass " << pass;
auto *the_pass = PassRegistry::Global().Retreive(pass);
the_pass->Run(argument);
if (argument->memory_optim_sort_kind_valid()) {
pass->Set(framework::ir::kGraphToProgramSortKind,
new int(argument->memory_optim_sort_kind()));
}
}
void IrAnalysisComposePass::CollectFusionStatis(Argument *argument) {
if (!argument->main_graph().Has(framework::ir::kFuseStatisAttr)) {
LOG(INFO) << "argument has no fuse statis";
return;
}
argument->SetFusionStatis(
argument->main_graph().Get<Argument::fusion_statis_t>(
framework::ir::kFuseStatisAttr));
std::unique_ptr<Graph> graph(argument->main_graph_ptr());
// Direct using ProgramDesc desc(argument->main_program()) may cause
// incomplete copies of information.
framework::ProgramDesc desc;
desc.CopyFrom(*argument->main_program().Proto());
pass->SetNotOwned("program", &desc);
auto thegraph = pass->Apply(std::move(graph));
thegraph.release(); // the argument still own the graph.
argument->SetIrAnalyzedProgram(
new framework::proto::ProgramDesc(*desc.Proto()));
}
} // namespace analysis
......
......@@ -14,31 +14,17 @@
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/inference/analysis/analysis_pass.h"
#include "paddle/fluid/inference/analysis/passes/passes.h"
namespace paddle {
namespace inference {
namespace analysis {
/*
* The analysis pass to run a list of IR passes (like a function call).
* Currently, it should be the first pass of analysis phase.
*/
class IrAnalysisComposePass : public AnalysisPass {
class IrGraphToProgramPass : public AnalysisPass {
public:
void RunImpl(Argument* argument) override;
std::string repr() const override;
void RunImpl(Argument *argument) override;
private:
void ApplyIrPasses(Argument* argument);
void CollectFusionStatis(Argument* argument);
// Assign a Scope for IR passes to modify the weights.
void AssignScopeToModify(Argument* argument);
std::string repr() const override { return "ir-graph-to-param-pass"; }
};
} // namespace analysis
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/inference/analysis/analysis_pass.h"
#include "paddle/fluid/platform/port.h"
namespace paddle {
namespace inference {
namespace analysis {
/*
* Memory optimization pass for inference with pre-analysis of memory usage
* without GC.
* Different from training, the inference memory reuse strategies doesn't
* include GC for that overhead is too much when batch size equals one.
*
* The inference memory reuse tries to pre-determine the tensor reusing strategy
* without runtime overhead.
*
* To improve the strategy's performance, a warm-up running is introduced:
* - Before officially deploy the inference program, one should warm it up and
* generate some runtime cache,
* - Run the inference program with several batches of data, it will persist
* some runtime information about memory of tensors to disk, we call the
* information the memory reusing cache,
* - With the memory reusing cache, user can deploy the inference to a
* service, before running the model, the inference program will load the
* memory cache, analysis it and generate the best memory reusing strategy,
* and adjust the execution of the network.
*
* With the warm-up and memory reusing cache design, the memory reusing
* algorithm can analysis the real memory consume of the tensors, even with the
* flexible LoDTensor and special shape changing operators such as
* sequence-pooling.
*/
class MemoryOptimizePass : public AnalysisPass {
public:
using space_table_t = std::unordered_map<std::string, size_t>;
using lifecycle_t = std::pair<int, int>;
struct MemoryAllocation {
size_t allocated; // allocated memory in byte.
size_t saved; // saved memory in byte.
int sort_kind; // the kind of the corresponding sorting algorithm.
// Get the memory saving ratio of temporary variables.
float GetSavingRatio() const;
};
virtual ~MemoryOptimizePass() = default;
protected:
void RunImpl(Argument *argument) override;
private:
void CollectLifeCycle(
std::unordered_map<std::string, lifecycle_t> *lifecycles,
int sort_kind) const;
void CollectVarMemorySize(
const std::unordered_map<std::string, size_t> &batch_var_ave_dim,
std::unordered_map<std::string, framework::ir::Node *> *tensor_nodes,
space_table_t *space_table) const;
// Returns percentage of saved memory.
void MakeReusePlan(
const std::vector<std::unordered_set<std::string>> &var_clusters,
const std::unordered_map<std::string, size_t> &var_batch_ave_size,
const space_table_t &space_table,
std::unordered_map<std::string, std::string> *reuse_table, int sort_kind,
MemoryAllocation *memory_allocation) const;
void PerformReusePlan(
const std::unordered_map<std::string, std::string> &reuse_table,
int sort_kind, std::unordered_set<std::string> *vars2remove) const;
public:
std::string repr() const override;
private:
mutable framework::ir::Graph *graph_{nullptr};
mutable int max_lifecycle_{-1};
};
static std::string GetMemoryCachePath(const std::string &model_path,
const std::string &prog_path) {
auto path = model_path.empty() ? prog_path : model_path;
return path + ".memory_cache";
}
} // namespace analysis
} // namespace inference
} // namespace paddle
......@@ -13,24 +13,31 @@
// limitations under the License.
#include "paddle/fluid/inference/analysis/passes/passes.h"
#include "paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc"
#include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h"
#include "paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h"
#include "paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h"
#include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h"
#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
namespace paddle {
namespace inference {
namespace analysis {
PassRegistry::PassRegistry() {
// Register manually to avoid the trivial `USE_OP` like macro for easier use
// and link.
passes_.emplace("ir_analysis_pass",
std::unique_ptr<AnalysisPass>(new IrAnalysisPass));
passes_.emplace("ir_graph_build_pass",
std::unique_ptr<AnalysisPass>(new IrGraphBuildPass));
passes_.emplace("ir_analysis_compose_pass",
std::unique_ptr<AnalysisPass>(new IrAnalysisComposePass));
passes_.emplace("memory_optimize_pass",
std::unique_ptr<AnalysisPass>(new MemoryOptimizePass));
passes_.emplace(
"ir_params_sync_among_devices_pass",
std::unique_ptr<AnalysisPass>(new IrParamsSyncAmongDevicesPass));
passes_.emplace(
"ir_graph_to_program_pass",
std::unique_ptr<IrGraphToProgramPass>(new IrGraphToProgramPass));
}
} // namespace analysis
......
......@@ -18,8 +18,10 @@ if(APPLE)
endif(APPLE)
set(inference_deps paddle_inference_api paddle_fluid_api analysis pass
ir_pass_manager naive_executor analysis_predictor ${GLOB_PASS_LIB})
set(inference_deps ${analysis_deps}
paddle_inference_api paddle_fluid_api
analysis pass naive_executor
${GLOB_PASS_LIB})
if(WITH_GPU AND TENSORRT_FOUND)
set(inference_deps ${inference_deps} tensorrt_engine tensorrt_converter)
......@@ -29,7 +31,8 @@ add_subdirectory(details)
cc_library(analysis_config SRCS analysis_config.cc DEPS lod_tensor paddle_pass_builder)
cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc)
cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder ir_pass_manager)
cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api zero_copy_tensor
reset_tensor_array analysis_config paddle_pass_builder ir_pass_manager ${inference_deps})
cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS
lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config
analysis_config paddle_pass_builder zero_copy_tensor
......@@ -44,7 +47,7 @@ if(WITH_TESTING)
ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book)
set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification)
endif()
cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor ${inference_deps}
cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor benchmark ${inference_deps}
ARGS --dirname=${WORD2VEC_MODEL_DIR})
if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
......
......@@ -44,16 +44,22 @@ PassStrategy *contrib::AnalysisConfig::pass_builder() const {
contrib::AnalysisConfig::AnalysisConfig(const std::string &model_dir) {
model_dir_ = model_dir;
Update();
}
contrib::AnalysisConfig::AnalysisConfig(const std::string &prog_file,
const std::string &params_file) {
prog_file_ = prog_file;
params_file_ = params_file;
Update();
}
void contrib::AnalysisConfig::SetModel(const std::string &prog_file_path,
const std::string &params_file_path) {
prog_file_ = prog_file_path;
params_file_ = params_file_path;
Update();
}
void contrib::AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb,
int device_id) {
......@@ -62,11 +68,17 @@ void contrib::AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb,
memory_pool_init_size_mb_ = memory_pool_init_size_mb;
device_id_ = device_id;
#else
LOG(ERROR) << "Please compile with gpu to EnableGpu";
LOG(ERROR) << "Please compile with gpu to EnableGpu()";
use_gpu_ = false;
#endif
Update();
}
void contrib::AnalysisConfig::DisableGpu() {
use_gpu_ = false;
Update();
}
void contrib::AnalysisConfig::DisableGpu() { use_gpu_ = false; }
contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) {
#define CP_MEMBER(member__) member__ = other.member__;
......@@ -81,11 +93,16 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) {
CP_MEMBER(use_gpu_);
CP_MEMBER(device_id_);
CP_MEMBER(memory_pool_init_size_mb_);
CP_MEMBER(enable_memory_optim_);
CP_MEMBER(static_memory_optim_);
CP_MEMBER(static_memory_optim_force_update_);
// TensorRT releated.
CP_MEMBER(use_tensorrt_);
CP_MEMBER(tensorrt_workspace_size_);
CP_MEMBER(tensorrt_max_batchsize_);
CP_MEMBER(tensorrt_min_subgraph_size_);
CP_MEMBER(tensorrt_precision_mode_);
// MKLDNN releated.
CP_MEMBER(use_mkldnn_);
CP_MEMBER(mkldnn_enabled_op_types_);
......@@ -109,6 +126,8 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) {
}
#undef CP_MEMBER
Update();
}
void contrib::AnalysisConfig::EnableMKLDNN() {
......@@ -119,33 +138,65 @@ void contrib::AnalysisConfig::EnableMKLDNN() {
LOG(ERROR) << "Please compile with MKLDNN first to use MKLDNN";
use_mkldnn_ = false;
#endif
Update();
}
void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size,
int max_batch_size,
int min_subgraph_size) {
void contrib::AnalysisConfig::EnableTensorRtEngine(
int workspace_size, int max_batch_size, int min_subgraph_size,
contrib::AnalysisConfig::Precision precision_mode) {
#ifdef PADDLE_WITH_CUDA
if (!use_gpu()) {
LOG(ERROR) << "To use TensorRT engine, please call EnableGpu() first";
return;
}
use_tensorrt_ = true;
tensorrt_workspace_size_ = workspace_size;
tensorrt_max_batchsize_ = max_batch_size;
tensorrt_min_subgraph_size_ = min_subgraph_size;
tensorrt_precision_mode_ = precision_mode;
Update();
#else
LOG(ERROR)
<< "To use TensorRT engine, please compile inference lib with GPU first.";
#endif
}
// TODO(Superjomn) refactor this, buggy.
void contrib::AnalysisConfig::Update() {
auto info = SerializeInfoCache();
if (info == serialized_info_cache_) return;
if (use_gpu_) {
pass_builder_.reset(new GpuPassStrategy);
// Transfer pass_builder and copy the existing compatible passes.
if (!pass_builder_ || ((use_gpu() ^ pass_builder_->use_gpu()))) {
if (use_gpu()) {
pass_builder_.reset(new GpuPassStrategy);
if (use_tensorrt_) {
// Append after the Affine_channel_conv_fuse pass.
pass_builder()->InsertPass(3, "tensorrt_subgraph_pass");
}
} else {
pass_builder_.reset(new CpuPassStrategy);
}
} else {
pass_builder_.reset(new CpuPassStrategy);
if (use_gpu()) {
pass_builder_.reset(new GpuPassStrategy(
*static_cast<GpuPassStrategy *>(pass_builder_.get())));
} else {
pass_builder_.reset(new CpuPassStrategy(
*static_cast<CpuPassStrategy *>(pass_builder_.get())));
}
}
if (use_tensorrt_) {
if (!use_gpu_) {
LOG(ERROR)
<< "TensorRT engine is not available when EnableGpu() not actived.";
} else {
const auto &passes = pass_builder_->AllPasses();
if (std::find(passes.begin(), passes.end(), "tensorrt_subgraph_pass") ==
std::end(passes)) {
// Append after the Affine_channel_conv_fuse pass.
pass_builder()->InsertPass(3, "tensorrt_subgraph_pass");
}
......@@ -165,6 +216,10 @@ void contrib::AnalysisConfig::Update() {
#endif
}
if (enable_memory_optim_) {
pass_builder()->AppendAnalysisPass("memory_optimize_pass");
}
if (ir_debug_) {
pass_builder()->TurnOnDebug();
}
......@@ -172,24 +227,44 @@ void contrib::AnalysisConfig::Update() {
std::string contrib::AnalysisConfig::SerializeInfoCache() {
std::stringstream ss;
ss << model_dir_;
ss << prog_file_;
ss << params_file_;
ss << use_gpu_;
ss << device_id_;
ss << memory_pool_init_size_mb_;
ss << use_tensorrt_;
ss << tensorrt_workspace_size_;
ss << tensorrt_max_batchsize_;
ss << tensorrt_min_subgraph_size_;
ss << enable_memory_optim_;
ss << static_memory_optim_;
ss << static_memory_optim_force_update_;
ss << use_mkldnn_;
for (auto &item : mkldnn_enabled_op_types_) ss << item;
ss << ";";
ss << model_from_memory_;
ss << enable_ir_optim_;
ss << use_feed_fetch_ops_;
ss << ir_debug_;
ss << specify_input_name_;
ss << cpu_math_library_num_threads_;
return ss.str();
}
void contrib::AnalysisConfig::SetCpuMathLibraryNumThreads(
int cpu_math_library_num_threads) {
cpu_math_library_num_threads_ = cpu_math_library_num_threads;
Update();
}
float contrib::AnalysisConfig::fraction_of_gpu_memory_for_pool() const {
......@@ -207,6 +282,19 @@ float contrib::AnalysisConfig::fraction_of_gpu_memory_for_pool() const {
#endif
}
void contrib::AnalysisConfig::EnableMemoryOptim(
bool static_optim, bool force_update_static_cache) {
enable_memory_optim_ = true;
static_memory_optim_ = static_optim;
static_memory_optim_force_update_ = force_update_static_cache;
Update();
}
bool contrib::AnalysisConfig::enable_memory_optim() const {
return enable_memory_optim_;
}
void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer,
size_t prog_buffer_size,
const char *param_buffer,
......@@ -214,6 +302,20 @@ void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer,
prog_file_ = std::string(prog_buffer, prog_buffer + prog_buffer_size);
params_file_ = std::string(param_buffer, param_buffer + param_buffer_size);
model_from_memory_ = true;
Update();
}
NativeConfig contrib::AnalysisConfig::ToNativeConfig() const {
NativeConfig config;
config.model_dir = model_dir_;
config.prog_file = prog_file_;
config.param_file = params_file_;
config.use_gpu = use_gpu_;
config.device = device_id_;
config.fraction_of_gpu_memory = fraction_of_gpu_memory_for_pool();
config.specify_input_name = specify_input_name_;
return config;
}
} // namespace paddle
......@@ -15,6 +15,7 @@
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include <glog/logging.h>
#include <algorithm>
#include <fstream>
#include <memory>
#include <string>
#include <vector>
......@@ -24,23 +25,35 @@
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/var_type_traits.h"
#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#if PADDLE_WITH_TENSORRT
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#endif
#include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/profiler.h"
#if PADDLE_WITH_TENSORRT
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h"
#endif
DECLARE_bool(profile);
namespace paddle {
using contrib::AnalysisConfig;
using inference::Singleton;
#if PADDLE_WITH_TENSORRT
using inference::tensorrt::TRTInt8Calibrator;
using inference::tensorrt::TRTCalibratorEngine;
using inference::tensorrt::TRTCalibratorEngineManager;
#endif
namespace {
bool IsPersistable(const framework::VarDesc *var) {
......@@ -110,6 +123,15 @@ bool AnalysisPredictor::PrepareProgram(
if (!program) {
if (!LoadProgramDesc()) return false;
// If not cloned, the parameters should be loaded.
// If config_.ir_optim() is True, parameters is loaded in
// OptimizeInferenceProgram(), but other persistable variables
// (like RAW type var) are not created in scope.
// If config_.ir_optim() is False, parameters is loaded in LoadParameters(),
// still need to create other persistable variables.
// So in both case, create persistable variables at first.
executor_->CreateVariables(*inference_program_, 0, true, sub_scope_);
// Optimize the program, and load parameters and modify them in the
// scope_.
// This will change the scope_ address.
......@@ -117,15 +139,6 @@ bool AnalysisPredictor::PrepareProgram(
status_ir_optim_enabled_ = true;
OptimizeInferenceProgram();
} else {
// If the parent_scope is passed, we assert that the persistable variables
// are already created, so just create the no persistable variables.
// If not cloned, the parameters should be loaded
// OptimizeInferenceProgram.
// So in both cases, just the local variables are needed to load, not the
// parematers.
executor_->CreateVariables(*inference_program_, 0, true, sub_scope_);
// Load parameters
LOG(INFO) << "load parameters ";
LoadParameters();
......@@ -189,6 +202,12 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
LOG(ERROR) << "fail to get fetches";
return false;
}
// Collect variable shapes for memory optimization.
if (need_collect_var_shapes_for_memory_optim()) {
CollectVarShapes();
}
VLOG(3) << "predict cost: " << timer.toc() << "ms";
// All the containers in the scope will be hold in inference, but the
......@@ -289,15 +308,15 @@ void AnalysisPredictor::GetFetchOne(const framework::LoDTensor &fetch,
bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs,
framework::Scope *scope) {
VLOG(3) << "Predictor::get_fetch";
outputs->resize(fetchs_.size());
for (size_t i = 0; i < fetchs_.size(); ++i) {
int idx = boost::get<int>(fetchs_[i]->GetAttr("col"));
outputs->resize(fetches_.size());
for (size_t i = 0; i < fetches_.size(); ++i) {
int idx = boost::get<int>(fetches_[i]->GetAttr("col"));
PADDLE_ENFORCE((size_t)idx == i);
framework::LoDTensor &fetch =
framework::GetFetchVariable(*scope, "fetch", idx);
auto type = fetch.type();
auto output = &(outputs->at(i));
output->name = fetchs_[idx]->Input("X")[0];
output->name = fetches_[idx]->Input("X")[0];
if (type == framework::proto::VarType::FP32) {
GetFetchOne<float>(fetch, output);
output->dtype = PaddleDType::FLOAT32;
......@@ -317,6 +336,10 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
argument_.SetUseGPU(config_.use_gpu());
argument_.SetGPUDeviceId(config_.gpu_device_id());
argument_.SetEnableMemoryOptim(config_.enable_memory_optim());
argument_.SetStaticMemoryOptim(config_.static_memory_optim_);
argument_.SetStaticMemoryOptimForceUpdate(
config_.static_memory_optim_force_update_);
argument_.SetModelFromMemory(config_.model_from_memory_);
// Analyze inference_program
if (!config_.model_dir().empty()) {
......@@ -326,25 +349,34 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
!config_.params_file().empty(),
"Either model_dir or (param_file, prog_file) should be set.");
PADDLE_ENFORCE(!config_.prog_file().empty());
std::string dir = inference::analysis::GetDirRoot(config_.prog_file());
argument_.SetModelProgramPath(config_.prog_file());
argument_.SetModelParamsPath(config_.params_file());
}
if (config_.use_gpu() && config_.tensorrt_engine_enabled()) {
LOG(INFO) << "TensorRT subgraph engine is enabled";
argument_.SetUseTensorRT(true);
argument_.SetTensorRtWorkspaceSize(config_.tensorrt_workspace_size_);
argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_);
argument_.SetTensorRtMinSubgraphSize(config_.tensorrt_min_subgraph_size_);
argument_.SetTensorRtPrecisionMode(config_.tensorrt_precision_mode_);
}
if (config_.use_mkldnn_) {
LOG(INFO) << "MKLDNN is enabled";
argument_.SetMKLDNNEnabledOpTypes(config_.mkldnn_enabled_op_types_);
}
auto passes = config_.pass_builder()->AllPasses();
if (!config_.ir_optim()) passes.clear();
if (!config_.ir_optim()) {
passes.clear();
LOG(INFO) << "ir_optim is turned off, no IR pass will be executed";
}
argument_.SetIrAnalysisPasses(passes);
argument_.SetScopeNotOwned(const_cast<framework::Scope *>(scope_.get()));
argument_.SetAnalysisPasses(config_.pass_builder()->AnalysisPasses());
argument_.SetScopeNotOwned(scope_.get());
Analyzer().Run(&argument_);
PADDLE_ENFORCE(argument_.scope_valid());
......@@ -405,10 +437,10 @@ void AnalysisPredictor::PrepareFeedFetch() {
feed_names_[op->Output("Out")[0]] = idx;
} else if (op->Type() == "fetch") {
int idx = boost::get<int>(op->GetAttr("col"));
if (fetchs_.size() <= static_cast<size_t>(idx)) {
fetchs_.resize(idx + 1);
if (fetches_.size() <= static_cast<size_t>(idx)) {
fetches_.resize(idx + 1);
}
fetchs_[idx] = op;
fetches_[idx] = op;
}
}
}
......@@ -550,7 +582,67 @@ bool AnalysisPredictor::LoadParameters() {
return true;
}
#if PADDLE_WITH_TENSORRT
bool AnalysisPredictor::SaveTrtCalibToDisk() {
PADDLE_ENFORCE(config_.tensorrt_engine_enabled(),
"This func can be invoked only in trt mode");
auto &block = inference_program_->Block(0);
for (auto &op_desc : block.AllOps()) {
if (op_desc->Type() == "tensorrt_engine") {
std::string engine_name =
boost::get<std::string>(op_desc->GetAttr("engine_key"));
if (!Singleton<TRTCalibratorEngineManager>::Global().Has(engine_name)) {
LOG(ERROR) << "You should run the predictor(with trt) on the real data "
"to generate calibration info";
return false;
}
TRTCalibratorEngine *calib_engine =
Singleton<TRTCalibratorEngineManager>::Global().Get(engine_name);
LOG(INFO) << "Wait for calib threads done.";
calib_engine->calib_->waitAndSetDone();
LOG(INFO) << "Generating TRT Calibration table data, this may cost a lot "
"of time...";
calib_engine->thr_->join();
std::string calibration_table_data =
calib_engine->calib_->getCalibrationTableAsString();
if (calibration_table_data.empty()) {
LOG(ERROR) << "the calibration table is empty.";
return false;
}
std::string model_opt_cache_dir =
argument_.Has("model_dir")
? argument_.model_dir()
: inference::analysis::GetDirRoot(argument_.model_program_path());
std::string calibration_table_data_path =
inference::analysis::GetTrtCalibPath(
inference::analysis::GetOrCreateModelOptCacheDir(
model_opt_cache_dir),
engine_name);
std::ofstream ofile(calibration_table_data_path, std::ios::out);
LOG(INFO) << "Write Paddle-TRT INT8 calibration table data to file "
<< calibration_table_data_path;
ofile << calibration_table_data;
ofile.close();
}
}
// Free all calibrator resources.
Singleton<TRTCalibratorEngineManager>::Global().DeleteALL();
return true;
}
#endif
AnalysisPredictor::~AnalysisPredictor() {
#if PADDLE_WITH_TENSORRT
if (config_.tensorrt_engine_enabled() &&
config_.tensorrt_precision_mode_ == AnalysisConfig::Precision::kInt8 &&
Singleton<TRTCalibratorEngineManager>::Global().Has()) {
SaveTrtCalibToDisk();
}
#endif
if (FLAGS_profile) {
platform::DisableProfiler(platform::EventSortingKey::kTotal,
"./profile.log");
......@@ -558,6 +650,13 @@ AnalysisPredictor::~AnalysisPredictor() {
if (sub_scope_) {
scope_->DeleteScope(sub_scope_);
}
// TODO(Superjomn) deduce the directory path.
std::string out_path = inference::analysis::GetMemoryCachePath(
config_.model_dir(), config_.prog_file());
if (need_collect_var_shapes_for_memory_optim()) {
SerializeBatchVarShapes(out_path);
}
}
std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone() {
......@@ -567,6 +666,70 @@ std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone() {
return std::unique_ptr<PaddlePredictor>(x);
}
void AnalysisPredictor::CollectVarShapes() {
VLOG(4) << "Collecting var shapes";
if (batch_var_shapes_.size() >= max_shape_collect_count_) return;
std::map<std::string, std::vector<int>> var_shapes;
for (auto var_name : inference_program_->Block(0).LocalVarNames()) {
auto *var = sub_scope_->FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(var);
if (var->Type() == framework::VarTypeTrait<framework::LoDTensor>::kId ||
var->Type() == framework::VarTypeTrait<framework::Tensor>::kId) {
auto &tensor = var->Get<framework::LoDTensor>();
auto shape = framework::vectorize(tensor.dims());
var_shapes[var_name].assign(shape.begin(), shape.end());
}
}
batch_var_shapes_.push_back(var_shapes);
LOG_FIRST_N(INFO, 1) << "Collected " << batch_var_shapes_.size()
<< " batch of var shapes for analysis";
}
void AnalysisPredictor::SerializeBatchVarShapes(const std::string &path) {
LOG(INFO) << "serialize batch var shapes to " << path;
std::ofstream file(path);
if (!file.is_open()) {
LOG(ERROR) << "failed to serialize the var shapes to " << path;
return;
}
// The sirialized data format:
// <tensor_name>:dim0,dim1,dim2,;
for (auto &batch : batch_var_shapes_) {
for (auto &ele : batch) {
file << ele.first << ":";
for (size_t i = 0; i < ele.second.size() - 1; i++) {
file << ele.second[i] << ",";
}
file << ele.second.back() << ";";
}
file << "\n";
}
}
bool AnalysisPredictor::need_collect_var_shapes_for_memory_optim() {
if (need_collect_var_shapes_ >= 0) return need_collect_var_shapes_;
bool need = false;
// check if the cache exists
if (!config_.enable_memory_optim()) {
need = false;
} else if (config_.static_memory_optim_ &&
!inference::IsFileExists(inference::analysis::GetMemoryCachePath(
config_.model_dir(), config_.prog_file()))) {
need = true;
} else if (config_.static_memory_optim_ &&
config_.static_memory_optim_force_update_) {
need = true;
}
need_collect_var_shapes_ = need ? 1 : 0;
return need;
}
std::string AnalysisPredictor::GetSeriazlizedProgram() const {
return inference_program_->Proto()->SerializeAsString();
}
template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<contrib::AnalysisConfig>(
const contrib::AnalysisConfig &config) {
......
......@@ -45,6 +45,7 @@ using contrib::AnalysisConfig;
class AnalysisPredictor : public PaddlePredictor {
public:
explicit AnalysisPredictor(const AnalysisConfig &config) : config_(config) {}
~AnalysisPredictor();
bool Init(const std::shared_ptr<framework::Scope> &parent_scope,
const std::shared_ptr<framework::ProgramDesc> &program = nullptr);
......@@ -74,7 +75,14 @@ class AnalysisPredictor : public PaddlePredictor {
void SetMkldnnThreadID(int tid);
std::string GetSeriazlizedProgram() const override;
protected:
// For memory optimization.
bool need_collect_var_shapes_for_memory_optim();
void CollectVarShapes();
void SerializeBatchVarShapes(const std::string &path);
bool PrepareProgram(const std::shared_ptr<framework::ProgramDesc> &program);
bool PrepareScope(const std::shared_ptr<framework::Scope> &parent_scope);
bool CreateExecutor();
......@@ -90,7 +98,21 @@ class AnalysisPredictor : public PaddlePredictor {
template <typename T>
void GetFetchOne(const framework::LoDTensor &fetchs,
PaddleTensor *output_data);
~AnalysisPredictor();
#if PADDLE_WITH_TENSORRT
// When we use Paddle-TRT INT8 engine, we need to generate calibration table
// data first,
// the calibration table contains the range for each op's input and output,
// this whole process can be divided into several steps:
//
// 1. Builds a 32-bit engine, runs it on the calibration set, and records a
// histogram for each
// tensor of the distribution of activation values.
// 2. Builds a calibration table from the histograms.
//
// After step 2, we need to store the calibration table on disk
bool SaveTrtCalibToDisk();
#endif
// Some more detailed tests, they are made the friends of the predictor, so that
// the all the details can be tested.
......@@ -110,7 +132,7 @@ class AnalysisPredictor : public PaddlePredictor {
std::shared_ptr<framework::ProgramDesc> inference_program_;
std::vector<framework::OpDesc *> feeds_;
std::map<std::string, size_t> feed_names_;
std::vector<framework::OpDesc *> fetchs_;
std::vector<framework::OpDesc *> fetches_;
// Memory buffer for feed inputs. The temporary LoDTensor will cause serious
// concurrency problems, wrong results and memory leak, so cache them.
std::vector<framework::LoDTensor> feed_tensors_;
......@@ -118,6 +140,11 @@ class AnalysisPredictor : public PaddlePredictor {
// A mutex help to make Clone thread safe.
std::mutex clone_mutex_;
// For memory optimization.
const size_t max_shape_collect_count_{1000};
int need_collect_var_shapes_{-1}; // -1 for default, 0 for false, 1 for true.
std::vector<std::map<std::string, std::vector<int>>> batch_var_shapes_;
private:
// Some status here that help to determine the status inside the predictor.
bool status_program_optimized_{false};
......
......@@ -16,8 +16,10 @@
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <thread> // NOLINT
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
DEFINE_string(dirname, "", "dirname to tests.");
......@@ -191,4 +193,55 @@ TEST(AnalysisPredictor, Clone) {
}
}
TEST(AnalysisPredictor, memory_optim) {
AnalysisConfig config(FLAGS_dirname);
config.DisableGpu();
config.EnableMemoryOptim(true);
config.pass_builder()->TurnOnDebug();
auto native_predictor =
CreatePaddlePredictor<NativeConfig>(config.ToNativeConfig());
// 2. Dummy Input Data
int64_t data[4] = {1, 2, 3, 4};
PaddleTensor tensor;
tensor.shape = std::vector<int>({4, 1});
tensor.data.Reset(data, sizeof(data));
tensor.dtype = PaddleDType::INT64;
std::vector<PaddleTensor> inputs(4, tensor);
std::vector<PaddleTensor> output, output1;
{
// The first predictor help to cache the memory optimize strategy.
auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
LOG(INFO) << "serialized program: " << predictor->GetSeriazlizedProgram();
ASSERT_FALSE(predictor->GetSeriazlizedProgram().empty());
// Run several times to check the parameters are not reused by mistake.
for (int i = 0; i < 5; i++) {
ASSERT_TRUE(predictor->Run(inputs, &output));
}
}
{
output.clear();
// The second predictor to perform memory optimization.
config.EnableMemoryOptim(false);
auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
// Run with memory optimization
ASSERT_TRUE(predictor->Run(inputs, &output));
}
// Run native
ASSERT_TRUE(native_predictor->Run(inputs, &output1));
LOG(INFO) << "the output " << inference::DescribeTensor(output.front());
LOG(INFO) << "the native output "
<< inference::DescribeTensor(output1.front());
inference::CompareResult(output, output1);
}
} // namespace paddle
......@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <sstream>
#include "paddle/fluid/framework/commit.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
......@@ -97,4 +99,12 @@ void PaddleBuf::Free() {
}
}
std::string get_version() {
std::stringstream ss;
ss << "version: " << framework::paddle_version() << "\n";
ss << "commit: " << framework::paddle_commit() << "\n";
ss << "branch: " << framework::paddle_compile_branch() << "\n";
return ss.str();
}
} // namespace paddle
......@@ -61,4 +61,10 @@ TEST(paddle_inference_api, demo) {
predictor->Run({}, &outputs);
}
TEST(paddle_inference_api, get_version) {
LOG(INFO) << "paddle version:\n" << get_version();
auto version = get_version();
ASSERT_FALSE(version.empty());
}
} // namespace paddle
#!/bin/bash
set -x
PADDLE_ROOT=$1
TURN_ON_MKL=$2 # use MKL or Openblas
......
......@@ -15,7 +15,10 @@
#pragma once
#include <glog/logging.h>
#include <fstream>
#if !defined(_WIN32)
#include <sys/time.h>
#endif
#include <algorithm>
#include <chrono> // NOLINT
#include <iterator>
......@@ -182,7 +185,8 @@ static bool CompareTensor(const PaddleTensor &a, const PaddleTensor &b) {
return true;
}
static std::string DescribeTensor(const PaddleTensor &tensor) {
static std::string DescribeTensor(const PaddleTensor &tensor,
int max_num_of_data = 15) {
std::stringstream os;
os << "Tensor [" << tensor.name << "]\n";
os << " - type: ";
......@@ -253,5 +257,12 @@ static void PrintTime(int batch_size, int repeat, int num_threads, int tid,
}
}
static bool IsFileExists(const std::string &path) {
std::ifstream file(path);
bool exists = file.is_open();
file.close();
return exists;
}
} // namespace inference
} // namespace paddle
......@@ -42,6 +42,10 @@ struct AnalysisConfig {
explicit AnalysisConfig(const std::string& model_dir);
explicit AnalysisConfig(const std::string& prog_file,
const std::string& params_file);
enum class Precision {
kFloat32 = 0,
kInt8,
};
/** Set model with a directory.
*/
......@@ -135,7 +139,8 @@ struct AnalysisConfig {
* subgraph is less than this, it will not transfer to TensorRT engine.
*/
void EnableTensorRtEngine(int workspace_size = 1 << 20,
int max_batch_size = 1, int min_subgraph_size = 3);
int max_batch_size = 1, int min_subgraph_size = 3,
Precision precision = Precision::kFloat32);
/** A boolean state telling whether the TensorRT engine is used.
*/
bool tensorrt_engine_enabled() const { return use_tensorrt_; }
......@@ -162,17 +167,7 @@ struct AnalysisConfig {
/** Transform the AnalysisConfig to NativeConfig.
*/
NativeConfig ToNativeConfig() const {
NativeConfig config;
config.model_dir = model_dir_;
config.prog_file = prog_file_;
config.param_file = params_file_;
config.use_gpu = use_gpu_;
config.device = device_id_;
config.fraction_of_gpu_memory = fraction_of_gpu_memory_for_pool();
config.specify_input_name = specify_input_name_;
return config;
}
NativeConfig ToNativeConfig() const;
/** Specify the operator type list to use MKLDNN acceleration.
* @param op_list the operator type list.
*/
......@@ -192,6 +187,14 @@ struct AnalysisConfig {
*/
bool model_from_memory() const { return model_from_memory_; }
/** Turn on memory optimize
* NOTE still in development, will release latter.
*/
void EnableMemoryOptim(bool static_optim = false,
bool force_update_static_cache = false);
/** Tell whether the memory optimization is activated. */
bool enable_memory_optim() const;
friend class ::paddle::AnalysisPredictor;
/** NOTE just for developer, not an official API, easily to be broken.
......@@ -231,6 +234,12 @@ struct AnalysisConfig {
// We set this variable to control the minimum number of nodes in the
// subgraph, 3 as default value.
int tensorrt_min_subgraph_size_{3};
Precision tensorrt_precision_mode_;
// memory reuse related.
bool enable_memory_optim_{false};
bool static_memory_optim_{false};
bool static_memory_optim_force_update_{false};
bool use_mkldnn_{false};
std::unordered_set<std::string> mkldnn_enabled_op_types_;
......
......@@ -215,6 +215,14 @@ class PaddlePredictor {
*/
virtual ~PaddlePredictor() = default;
/** \brief Get the serialized model program that executes in inference phase.
* Its data type is ProgramDesc, which is a protobuf message.
*/
virtual std::string GetSeriazlizedProgram() const {
assert(false); // Force raise error.
return "NotImplemented";
};
/** The common configs for all the predictors.
*/
struct Config {
......@@ -288,4 +296,6 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
int PaddleDtypeSize(PaddleDType dtype);
std::string get_version();
} // namespace paddle
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/inference/api/paddle_pass_builder.h"
#include <glog/logging.h>
namespace paddle {
......@@ -65,4 +66,8 @@ void GpuPassStrategy::EnableMKLDNN() {
LOG(ERROR) << "GPU not support MKLDNN yet";
}
void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) {
analysis_passes_.push_back(pass);
}
} // namespace paddle
nv_library(tensorrt_engine SRCS engine.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context)
nv_library(tensorrt_engine SRCS engine.cc trt_int8_calibrator.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context)
nv_library(tensorrt_op_teller SRCS op_teller.cc DEPS framework_proto)
nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader)
nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine)
......
......@@ -29,9 +29,9 @@ TEST(OpConverter, ConvertBlock) {
// init trt engine
cudaStream_t stream_;
std::unique_ptr<TensorRTEngine> engine_;
engine_.reset(new TensorRTEngine(5, 1 << 15, &stream_));
engine_->InitNetwork();
PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0);
engine_.reset(new TensorRTEngine(5, 1 << 15, stream_));
engine_->InitNetwork();
engine_->DeclareInput("conv2d-X", nvinfer1::DataType::kFLOAT,
nvinfer1::Dims3(2, 5, 5));
......
......@@ -27,8 +27,8 @@ namespace tensorrt {
class TensorRTEngineTest : public ::testing::Test {
protected:
void SetUp() override {
// ASSERT_EQ(0, cudaStreamCreate(&stream_));
engine_ = new TensorRTEngine(10, 1 << 10, &stream_);
ASSERT_EQ(0, cudaStreamCreate(&stream_));
engine_ = new TensorRTEngine(10, 1 << 10, stream_);
engine_->InitNetwork();
}
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册