Commit e9ac7df9 authored by: D dongzhihong

Merge remote-tracking branch 'origin/develop' into op_name

#!/bin/bash
set -e
readonly VERSION="3.8"
version=$(clang-format -version)
if ! [[ $version == *"$VERSION"* ]]; then
echo "clang-format version check failed."
echo "a version contains '$VERSION' is needed, but get '$version'"
echo "you can install the right version, and make an soft-link to '\$PATH' env"
exit -1
fi
clang-format $@
......@@ -19,10 +19,10 @@
- id: end-of-file-fixer
- repo: local
hooks:
- id: clang-format
- id: clang-format-with-version-check
name: clang-format
description: Format files with ClangFormat.
entry: clang-format -i
entry: bash ./.clang_format.hook -i
language: system
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$
- repo: https://github.com/PaddlePaddle/pre-commit-golang
......
......@@ -4,7 +4,6 @@ cache:
- $HOME/.ccache
- $HOME/.cache/pip
- $TRAVIS_BUILD_DIR/build/third_party
- $TRAVIS_BUILD_DIR/build_android/third_party
sudo: required
dist: trusty
os:
......@@ -12,7 +11,6 @@ os:
env:
- JOB=build_doc
- JOB=check_style
- JOB=build_android
addons:
apt:
packages:
......@@ -23,7 +21,6 @@ addons:
- python
- python-pip
- python2.7-dev
- python-numpy
- python-wheel
- libboost-dev
- curl
......@@ -37,8 +34,8 @@ before_install:
- if [[ "$JOB" == "check_style" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
# Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python
# protobuf version.
- pip install -r $TRAVIS_BUILD_DIR/python/requirements.txt
- pip install wheel sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit LinkChecker
- sudo pip install -r $TRAVIS_BUILD_DIR/python/requirements.txt
- sudo pip install wheel sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit LinkChecker
- curl https://glide.sh/get | bash
- eval "$(GIMME_GO_VERSION=1.8.3 gimme)"
- go get -u github.com/alecthomas/gometalinter
......
......@@ -36,8 +36,8 @@ include(simd)
################################ Configurations #######################################
option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND})
option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND})
option(WITH_MKLDNN "Compile PaddlePaddle with mkl-dnn support." OFF)
option(WITH_MKLML "Compile PaddlePaddle with mklml package." OFF)
option(WITH_MKLDNN "Compile PaddlePaddle with mkl-dnn support." ${AVX_FOUND})
option(WITH_MKLML "Compile PaddlePaddle with mklml package." ${AVX_FOUND})
option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON)
option(WITH_TESTING "Compile PaddlePaddle with unit testing" ON)
option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON)
......@@ -55,6 +55,7 @@ option(WITH_C_API "Compile PaddlePaddle with C-API(Prediction)" OFF)
option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF)
option(GLIDE_INSTALL "Download and install go dependencies " ON)
option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF)
option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF)
# CMAKE_BUILD_TYPE
if(NOT CMAKE_BUILD_TYPE)
......@@ -64,8 +65,8 @@ if(NOT CMAKE_BUILD_TYPE)
endif()
if(ANDROID)
if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "21")
message(FATAL_ERROR "Unsupport standalone toolchains with Android API level lower than 21")
if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "16")
message(FATAL_ERROR "Unsupport standalone toolchains with Android API level lower than 16")
endif()
set(WITH_GPU OFF CACHE STRING
......@@ -137,9 +138,9 @@ set(EXTERNAL_LIBS
)
if(WITH_GPU)
list(APPEND EXTERNAL_LIB ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
if(NOT WITH_DSO)
list(APPEND EXTERNAL_LIB ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY})
list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY})
endif(NOT WITH_DSO)
endif(WITH_GPU)
......
......@@ -10,13 +10,11 @@ RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ub
ARG WITH_GPU
ARG WITH_AVX
ARG WITH_DOC
ARG WITH_STYLE_CHECK
ENV WOBOQ OFF
ENV WITH_GPU=${WITH_GPU:-OFF}
ENV WITH_GPU=${WITH_GPU:-ON}
ENV WITH_AVX=${WITH_AVX:-ON}
ENV WITH_DOC=${WITH_DOC:-OFF}
ENV WITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF}
ENV HOME /root
# Add bash enhancements
......@@ -34,9 +32,6 @@ RUN apt-get update && \
net-tools && \
apt-get clean -y
# paddle is using numpy.flip, which is introduced since 1.12.0
RUN pip --no-cache-dir install 'numpy>=1.12.0'
# Install Go and glide
RUN wget -qO- https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \
tar -xz -C /usr/local && \
......@@ -58,33 +53,22 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8
# FIXME: due to temporary ipykernel dependency issue, specify ipykernel jupyter
# version util jupyter fixes this issue.
RUN pip install --upgrade pip && \
pip install -U 'protobuf==3.1.0' && \
pip install -U wheel pillow BeautifulSoup && \
pip install -U wheel && \
pip install -U docopt PyYAML sphinx && \
pip install -U sphinx-rtd-theme==0.1.9 recommonmark && \
pip install pre-commit 'requests==2.9.2' 'ipython==5.3.0' && \
pip install -U sphinx-rtd-theme==0.1.9 recommonmark
RUN pip install pre-commit 'ipython==5.3.0' && \
pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
pip install opencv-python rarfile 'scipy>=0.19.0' 'nltk>=3.2.2'
pip install opencv-python
COPY ./python/requirements.txt /root/
RUN pip install -r /root/requirements.txt
# To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use
# the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2
RUN apt-get install -y libssl-dev libffi-dev
RUN pip install certifi urllib3[secure]
# TODO(qijun) The template library Eigen doesn't work well with GCC 5
# coming with the default Docker image, so we switch to use GCC 4.8
# by default. And I will check Eigen library later.
RUN ln -sf gcc-4.8 /usr/bin/gcc && \
ln -sf gcc-ar-4.8 /usr/bin/gcc-ar && \
ln -sf gcc-nm-4.8 /usr/bin/gcc-nm && \
ln -sf gcc-ranlib-4.8 /usr/bin/gcc-ranlib && \
ln -sf gcc-4.8 /usr/bin/x86_64-linux-gnu-gcc && \
ln -sf gcc-ar-4.8 /usr/bin/x86_64-linux-gnu-gcc-ar && \
ln -sf gcc-nm-4.8 /usr/bin/x86_64-linux-gnu-gcc-nm && \
ln -sf gcc-ranlib-4.8 /usr/bin/x86_64-linux-gnu-gcc-ranlib && \
ln -sf g++-4.8 /usr/bin/g++ && \
ln -sf g++-4.8 /usr/bin/x86_64-linux-gnu-g++
# Install woboq_codebrowser to /woboq
RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \
......
......@@ -4,9 +4,15 @@ MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
ARG UBUNTU_MIRROR
RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
# ENV variables
ARG ANDROID_ABI
ENV ANDROID_ABI=${ANDROID_ABI:-"armeabi-v7a"}
ENV HOME=/root \
ANDROID_NDK_HOME=/opt/android-ndk-linux \
ANDROID_STANDALONE_TOOLCHAIN=/opt/android-toolchain-gcc
ANDROID_ARM_STANDALONE_TOOLCHAIN=/opt/arm-toolchain \
ANDROID_ARM64_STANDALONE_TOOLCHAIN=/opt/arm64-toolchain
RUN apt-get update && \
apt-get install -y \
......@@ -15,12 +21,11 @@ RUN apt-get update && \
apt-get clean -y
# Install Go and glide
RUN wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \
tar -C /usr/local -xzf go.tgz && \
RUN wget -qO- go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \
tar -xz -C /usr/local && \
mkdir /root/gopath && \
mkdir /root/gopath/bin && \
mkdir /root/gopath/src && \
rm go.tgz
mkdir /root/gopath/src
ENV GOROOT=/usr/local/go GOPATH=/root/gopath
# should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT.
ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
......@@ -42,7 +47,8 @@ RUN mkdir /opt/android-ndk-tmp && \
wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip && \
unzip -q android-ndk-r14b-linux-x86_64.zip && \
mv android-ndk-r14b ${ANDROID_NDK_HOME} && \
${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh --arch=arm --platform=android-21 --install-dir=${ANDROID_STANDALONE_TOOLCHAIN} && \
${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh --arch=arm --platform=android-23 --install-dir=${ANDROID_ARM_STANDALONE_TOOLCHAIN} && \
${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh --arch=arm64 --platform=android-23 --install-dir=${ANDROID_ARM64_STANDALONE_TOOLCHAIN} && \
rm -rf /opt/android-ndk-tmp && \
rm -rf ${ANDROID_NDK_HOME}
......
......@@ -28,6 +28,10 @@ if(NOT WITH_TIMER)
add_definitions(-DPADDLE_DISABLE_TIMER)
endif(NOT WITH_TIMER)
if(USE_EIGEN_FOR_BLAS)
add_definitions(-DPADDLE_USE_EIGEN_FOR_BLAS)
endif(USE_EIGEN_FOR_BLAS)
if(NOT WITH_PROFILER)
add_definitions(-DPADDLE_DISABLE_PROFILER)
endif(NOT WITH_PROFILER)
......
......@@ -20,6 +20,7 @@
# The supported variables are listed belows:
#
# ANDROID_STANDALONE_TOOLCHAIN
# ANDROID_TOOLCHAIN
# ANDROID_ABI
# ANDROID_NATIVE_API_LEVEL
# ANDROID_ARM_MODE
......@@ -57,6 +58,10 @@ IF(NOT DEFINED CMAKE_SYSTEM_VERSION AND ANDROID_NATIVE_API_LEVEL)
ENDIF()
ENDIF()
IF(NOT DEFINED ANDROID_TOOLCHAIN)
SET(ANDROID_TOOLCHAIN clang)
ENDIF()
IF(NOT DEFINED ANDROID_ABI)
SET(ANDROID_ABI "armeabi-v7a")
ENDIF()
......@@ -82,6 +87,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
"${CMAKE_VERSION}), when cross-compiling for Android.")
IF(ANDROID_STANDALONE_TOOLCHAIN)
# Use standalone toolchain
SET(CMAKE_SYSROOT "${ANDROID_STANDALONE_TOOLCHAIN}/sysroot")
IF(NOT CMAKE_SYSTEM_VERSION)
......@@ -96,26 +102,44 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
ENDIF()
# Toolchain
SET(ANDROID_TOOLCHAIN "gcc")
SET(ANDROID_TOOLCHAIN_ROOT ${ANDROID_STANDALONE_TOOLCHAIN})
IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
SET(ANDROID_TOOLCHAIN_NAME arm-linux-androideabi)
IF(ANDROID_ABI STREQUAL "armeabi")
SET(CMAKE_SYSTEM_PROCESSOR armv5te)
ELSEIF(ANDROID_ABI STREQUAL "armeabi-v7a")
SET(CMAKE_SYSTEM_PROCESSOR armv7-a)
ENDIF()
ENDIF()
IF(ANDROID_ABI STREQUAL "arm64-v8a")
SET(ANDROID_TOOLCHAIN_NAME aarch64-linux-android)
SET(CMAKE_SYSTEM_PROCESSOR aarch64)
ELSE(ANDROID_NDK)
# TODO: use android ndk
ENDIF()
IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
SET(ANDROID_TOOLCHAIN_NAME arm-linux-androideabi)
IF(ANDROID_ABI STREQUAL "armeabi")
SET(CMAKE_SYSTEM_PROCESSOR armv5te)
SET(ANDROID_CLANG_TRIPLE armv5te-none-linux-androideabi)
ELSEIF(ANDROID_ABI STREQUAL "armeabi-v7a")
SET(CMAKE_SYSTEM_PROCESSOR armv7-a)
SET(ANDROID_CLANG_TRIPLE armv7-none-linux-androideabi)
ENDIF()
SET(ANDROID_TOOLCHAIN_PREFIX "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_NAME}-")
ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a")
SET(ANDROID_TOOLCHAIN_NAME aarch64-linux-android)
SET(CMAKE_SYSTEM_PROCESSOR aarch64)
SET(ANDROID_CLANG_TRIPLE aarch64-none-linux-android)
ELSE()
MESSAGE(FATAL_ERROR "Invalid Android ABI: ${ANDROID_ABI}.")
ENDIF()
SET(ANDROID_TOOLCHAIN_PREFIX "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_NAME}-")
IF(ANDROID_TOOLCHAIN STREQUAL clang)
SET(ANDROID_C_COMPILER_NAME clang)
SET(ANDROID_CXX_COMPILER_NAME clang++)
SET(CMAKE_C_COMPILER_TARGET ${ANDROID_CLANG_TRIPLE})
SET(CMAKE_CXX_COMPILER_TARGET ${ANDROID_CLANG_TRIPLE})
ELSEIF(ANDROID_TOOLCHAIN STREQUAL gcc)
SET(ANDROID_C_COMPILER_NAME gcc)
SET(ANDROID_CXX_COMPILER_NAME g++)
ELSE()
MESSAGE(FATAL_ERROR "Invalid Android toolchain: ${ANDROID_TOOLCHAIN}")
ENDIF()
# C compiler
IF(NOT CMAKE_C_COMPILER)
SET(ANDROID_C_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}gcc")
SET(ANDROID_C_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}${ANDROID_C_COMPILER_NAME}")
ELSE()
GET_FILENAME_COMPONENT(ANDROID_C_COMPILER ${CMAKE_C_COMPILER} PROGRAM)
ENDIF()
......@@ -125,7 +149,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
# CXX compiler
IF(NOT CMAKE_CXX_COMPILER)
SET(ANDROID_CXX_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}g++")
SET(ANDROID_CXX_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}${ANDROID_CXX_COMPILER_NAME}")
ELSE()
GET_FILENAME_COMPONENT(ANDROID_CXX_COMPILER ${CMAKE_CXX_COMPILER} PROGRAM)
ENDIF()
......@@ -137,7 +161,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
SET(CMAKE_CXX_COMPILER ${ANDROID_CXX_COMPILER} CACHE PATH "CXX compiler" FORCE)
# Toolchain and ABI specific flags.
SET(ANDROID_COMPILER_FLAGS "-ffunction-sections -fdata-sections -finline-limit=64")
SET(ANDROID_COMPILER_FLAGS "-ffunction-sections -fdata-sections")
SET(ANDROID_LINKER_FLAGS "-Wl,--gc-sections")
IF(ANDROID_ABI STREQUAL "armeabi")
......@@ -145,8 +169,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
-march=armv5te
-mtune=xscale
-msoft-float)
ENDIF()
IF(ANDROID_ABI STREQUAL "armeabi-v7a")
ELSEIF(ANDROID_ABI STREQUAL "armeabi-v7a")
LIST(APPEND ANDROID_COMPILER_FLAGS
-march=armv7-a
-mfloat-abi=softfp)
......@@ -156,6 +179,8 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
LIST(APPEND ANDROID_COMPILER_FLAGS -mfpu=vfpv3-d16)
ENDIF()
LIST(APPEND ANDROID_LINKER_FLAGS -Wl,--fix-cortex-a8)
ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a")
LIST(APPEND ANDROID_COMPILER_FLAGS -march=armv8-a)
ENDIF()
IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
......@@ -164,10 +189,18 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
ELSE()
LIST(APPEND ANDROID_COMPILER_FLAGS -mthumb)
ENDIF()
IF(ANDROID_TOOLCHAIN STREQUAL clang)
# Disable integrated-as for better compatibility.
LIST(APPEND ANDROID_COMPILER_FLAGS -fno-integrated-as)
ENDIF()
ENDIF()
IF(ANDROID_ABI STREQUAL "arm64-v8a")
LIST(APPEND ANDROID_COMPILER_FLAGS -march=armv8-a)
IF(ANDROID_TOOLCHAIN STREQUAL clang)
# CMake automatically forwards all compiler flags to the linker,
# and clang doesn't like having -Wa flags being used for linking.
# To prevent CMake from doing this would require meddling with
# the CMAKE_<LANG>_COMPILE_OBJECT rules, which would get quite messy.
LIST(APPEND ANDROID_LINKER_FLAGS -Qunused-arguments)
ENDIF()
STRING(REPLACE ";" " " ANDROID_COMPILER_FLAGS "${ANDROID_COMPILER_FLAGS}")
......
......@@ -2,7 +2,7 @@ if(NOT WITH_GPU)
return()
endif()
set(CUDNN_ROOT "" CACHE PATH "CUDNN ROOT")
set(CUDNN_ROOT "/usr" CACHE PATH "CUDNN ROOT")
find_path(CUDNN_INCLUDE_DIR cudnn.h
PATHS ${CUDNN_ROOT} ${CUDNN_ROOT}/include
$ENV{CUDNN_ROOT} $ENV{CUDNN_ROOT}/include ${CUDA_TOOLKIT_INCLUDE}
......
......@@ -51,7 +51,7 @@ ExternalProject_Add(
${EXTERNAL_PROJECT_LOG_ARGS}
DEPENDS ${MKLDNN_DEPENDS}
GIT_REPOSITORY "https://github.com/01org/mkl-dnn.git"
GIT_TAG "v0.9"
GIT_TAG "v0.10"
PREFIX ${MKLDNN_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}
......
......@@ -28,7 +28,7 @@ INCLUDE(ExternalProject)
SET(MKLML_PROJECT "extern_mklml")
SET(MKLML_VER "mklml_lnx_2018.0.20170720")
SET(MKLML_URL "https://github.com/01org/mkl-dnn/releases/download/v0.9/${MKLML_VER}.tgz")
SET(MKLML_URL "https://github.com/01org/mkl-dnn/releases/download/v0.10/${MKLML_VER}.tgz")
SET(MKLML_SOURCE_DIR "${THIRD_PARTY_PATH}/mklml")
SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
SET(MKLML_DST_DIR "mklml")
......@@ -54,7 +54,8 @@ ExternalProject_Add(
${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${MKLML_SOURCE_DIR}
DOWNLOAD_DIR ${MKLML_DOWNLOAD_DIR}
DOWNLOAD_COMMAND wget --no-check-certificate -qO- ${MKLML_URL} | tar xz -C ${MKLML_DOWNLOAD_DIR}
DOWNLOAD_COMMAND wget --no-check-certificate ${MKLML_URL} -c -q -O ${MKLML_VER}.tgz
&& tar zxf ${MKLML_VER}.tgz
DOWNLOAD_NO_PROGRESS 1
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLML_INSTALL_ROOT}
......
......@@ -12,6 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
IF(USE_EIGEN_FOR_BLAS)
return()
ENDIF(USE_EIGEN_FOR_BLAS)
INCLUDE(cblas)
IF(NOT ${CBLAS_FOUND})
......@@ -25,7 +29,12 @@ IF(NOT ${CBLAS_FOUND})
"${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
CACHE FILEPATH "openblas library." FORCE)
SET(COMMON_ARGS CC=${CMAKE_C_COMPILER} NO_SHARED=1 NO_LAPACK=1 libs)
IF(APPLE)
SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}")
SET(COMMON_ARGS CC=${OPENBLAS_CC} NO_SHARED=1 NO_LAPACK=1 libs)
ELSE()
SET(COMMON_ARGS CC=${CMAKE_C_COMPILER} NO_SHARED=1 NO_LAPACK=1 libs)
ENDIF()
IF(CMAKE_CROSSCOMPILING)
IF(ANDROID)
......@@ -40,11 +49,11 @@ IF(NOT ${CBLAS_FOUND})
SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=${TARGET} ARM_SOFTFP_ABI=1 USE_THREAD=0)
ELSEIF(RPI)
# use hardfp
SET(OPENBLAS_COMMIT "v0.2.19")
SET(OPENBLAS_COMMIT "v0.2.20")
SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=ARMV7 USE_THREAD=0)
ENDIF()
ELSE()
SET(OPENBLAS_COMMIT "v0.2.19")
SET(OPENBLAS_COMMIT "v0.2.20")
SET(OPTIONAL_ARGS "")
IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(_64)?$")
SET(OPTIONAL_ARGS DYNAMIC_ARCH=1 NUM_THREADS=64)
......@@ -73,10 +82,18 @@ INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})
# linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas)
SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cblas_dummy.c)
FILE(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
ADD_LIBRARY(cblas STATIC ${dummyfile})
IF(${CBLAS_PROVIDER} MATCHES MKL)
ADD_LIBRARY(cblas SHARED ${dummyfile})
ELSE()
ADD_LIBRARY(cblas STATIC ${dummyfile})
ENDIF()
TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES})
IF(NOT ${CBLAS_FOUND})
ADD_DEPENDENCIES(cblas extern_openblas)
LIST(APPEND external_project_dependencies cblas)
ELSE()
IF("${CBLAS_PROVIDER}" STREQUAL "MKLML")
ADD_DEPENDENCIES(cblas mklml)
ENDIF()
ENDIF(NOT ${CBLAS_FOUND})
......@@ -9,13 +9,6 @@ function(CheckCompilerCXX11Flag)
if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 4.8)
message(FATAL_ERROR "Unsupported GCC version. GCC >= 4.8 required.")
endif()
if(NOT ANDROID)
# TODO(qijun) gcc 4.9 or later versions raise SEGV due to the optimization problem.
# Use Debug mode instead for now.
if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.9 OR CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 4.9)
set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "" FORCE)
endif()
endif()
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
# cmake >= 3.0 compiler id "AppleClang" on Mac OS X, otherwise "Clang"
# Apple Clang is a different compiler than upstream Clang which havs different version numbers.
......@@ -160,7 +153,7 @@ set(CUDA_PROPAGATE_HOST_FLAGS OFF)
# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
# So, don't set these flags here.
LIST(APPEND CUDA_NVCC_FLAGS -std=c++11 --default-stream per-thread)
LIST(APPEND CUDA_NVCC_FLAGS -std=c++11)
LIST(APPEND CUDA_NVCC_FLAGS --use_fast_math)
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
......
About PaddlePaddle
==================
PaddlePaddle is a parallel, distributed deep learning platform originally developed by Baidu scientists and engineers. It is easy to use, efficient, flexible and scalable, and is already widely used by many product lines inside Baidu.
PaddlePaddle is now open source, but it is still far from complete; we hope to keep improving, extending and expanding it on this basis.
We also hope that developers will actively provide feedback and contribute source code, building an active open-source community.
Credits
--------
Special thanks to [all contributors](https://github.com/PaddlePaddle/Paddle/graphs/contributors) of PaddlePaddle.
ABOUT
=======
PaddlePaddle is an easy-to-use, efficient, flexible and scalable deep learning platform,
which was originally developed by Baidu scientists and engineers for the purpose of applying deep learning to many products at Baidu.
PaddlePaddle is now open source but far from complete; it is intended to be built upon, improved, scaled, and extended.
We hope to build an active open source community both by providing feedback and by actively contributing to the source code.
Credits
--------
We owe many thanks to `all contributors and developers <https://github.com/PaddlePaddle/Paddle/graphs/contributors>`_ of PaddlePaddle!
......@@ -257,6 +257,11 @@ seq_concat
.. autoclass:: paddle.v2.layer.seq_concat
:noindex:
seq_slice
---------
.. autoclass:: paddle.v2.layer.seq_slice
:noindex:
kmax_sequence_score
-------------------
.. autoclass:: paddle.v2.layer.kmax_sequence_score
......@@ -362,6 +367,11 @@ trans
.. autoclass:: paddle.v2.layer.trans
:noindex:
scale_shift
-----------
.. autoclass:: paddle.v2.layer.scale_shift
:noindex:
Sampling Layers
===============
......@@ -409,9 +419,14 @@ multi_binary_label_cross_entropy_cost
.. autoclass:: paddle.v2.layer.multi_binary_label_cross_entropy_cost
:noindex:
huber_cost
----------
.. autoclass:: paddle.v2.layer.huber_cost
huber_regression_cost
-------------------------
.. autoclass:: paddle.v2.layer.huber_regression_cost
:noindex:
huber_classification_cost
-------------------------
.. autoclass:: paddle.v2.layer.huber_classification_cost
:noindex:
lambda_cost
......@@ -419,9 +434,9 @@ lambda_cost
.. autoclass:: paddle.v2.layer.lambda_cost
:noindex:
mse_cost
square_error_cost
--------
.. autoclass:: paddle.v2.layer.mse_cost
.. autoclass:: paddle.v2.layer.square_error_cost
:noindex:
rank_cost
......
......@@ -54,17 +54,18 @@ The life cycle of a single task is illustrated below:
<img src="src/paddle-task-states.png"/>
1. When a new pass of training starts, all tasks will be placed in the todo queue.
1. The master server will dispatch few tasks to each trainer at a time, puts them in the pending queue and waits for completion.
1. The trainer will work on its tasks and tell the master server once a task is completed. The master server will dispatch a new task to that trainer.
1. If a task timeout. the master server will move it back to the todo queue. The timeout count will increase by one. If the timeout count is above a threshold, the task is likely to cause a trainer to crash, so it will be discarded.
1. Upon a trainer's request for a new task, the master server will dispatch a task from the todo queue to it, put the task in the pending queue and wait for completion.
1. The trainer will work on its task, tell the master server once the task is completed, and ask for a new task. The master server will dispatch a new task to that trainer.
1. If a task fails for any reason in the trainer, or takes longer than a specific period of time, the master server will move the task back to the todo queue. The timeout count for that task will increase by one. If the timeout count is above a threshold, the task is likely to cause a trainer to crash, so it will be discarded.
1. The master server will move completed tasks to the done queue. When the todo queue is empty, the master server will start a new pass by moving all tasks in the done queue to the todo queue and resetting the timeout counters of all tasks to zero.
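The life cycle above can be summarized in a few lines of code. The following is an illustrative Python sketch only (the real master server is implemented in Go); the names mirror the todo/pending/done queues described above.

```python
from collections import deque

todo = deque([{"id": i, "timeouts": 0} for i in range(4)])   # all tasks start in todo
pending, done = {}, []

def dispatch():
    """Move one task from todo to pending and hand it to a trainer."""
    task = todo.popleft()
    pending[task["id"]] = task
    return task

def complete(task_id):
    """A trainer reported completion: move the task to the done queue."""
    done.append(pending.pop(task_id))

def fail_or_timeout(task_id, max_timeouts=3):
    """A task failed or timed out: retry it unless it has failed too often."""
    task = pending.pop(task_id)
    task["timeouts"] += 1
    if task["timeouts"] <= max_timeouts:
        todo.append(task)        # back to todo, to be dispatched again
    # otherwise the task is discarded as likely to crash trainers

def start_new_pass():
    """When todo and pending are empty, recycle done into todo for a new pass."""
    for task in done:
        task["timeouts"] = 0
    todo.extend(done)
    done.clear()
```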
### Trainer Process
The trainer process will:
- Receive tasks from the master.
- Work on the tasks: calculate and upload gradient to parameter servers, and update local model by downloading new parameters from parameter servers.
- Request tasks from the master.
- Work on the tasks
- Upload gradient to parameter servers, and update local model by downloading new parameters from parameter servers.
### Parameter Server Process
......@@ -119,8 +120,8 @@ When the master is started by the Kubernetes, it executes the following steps at
1. Grabs a unique *master* lock in etcd, which prevents concurrent master instantiations.
1. Recovers the task queues from etcd if they already exist, otherwise, the master will create them.
1. Watches the trainer prefix keys `/trainer/` on etcd to find the live trainers.
1. Starts dispatching the tasks to the trainers, and updates task queue using an etcd transaction to ensure lock is held during the update.
1. Writes its IP address to */master/addr* so that trainers can discover it.
1. Listens for trainers' task requests, dispatches one task upon request, and updates the task queue using an etcd transaction to ensure the lock is held during the update.
When the master server process is dead for any reason, Kubernetes will restart it. It will be online again with all states recovered from etcd in few minutes.
......@@ -128,13 +129,11 @@ When the master server process is dead for any reason, Kubernetes will restart i
When the trainer is started by the Kubernetes, it executes the following steps at startup:
1. Watches the available parameter server prefix keys `/ps/` on etcd and waits until the count of parameter servers reaches the desired count.
1. Generates a unique ID, and sets key `/trainer/<unique ID>` with its contact address as value. The key will be deleted when the lease expires, so the master will be aware of the trainer being online and offline.
1. Waits for tasks from the master to start training.
1. Watches the available parameter server prefix keys `/ps/` on etcd and waits until the count of parameter servers reaches the desired count */ps_desired*.
1. Finds and watches */master/addr* to get master's address.
1. Requests tasks from the master to start training.
If the trainer's etcd lease expires, it will try to set the key `/trainer/<unique ID>` again so that the master server can discover the trainer again.
When a trainer fails, Kuberentes would try to restart it. The recovered trainer would fetch tasks from the TODO queue and go on training.
When a trainer fails, Kubernetes will try to restart it. The recovered trainer will fetch tasks from the master and go on training.
### Parameter Server Process
......
# Analysis of large model distributed training in Paddle
***NOTE: These are only some notes on how we implemented this scheme in V1, not a new design.***
## What is it
We often encounter cases where the embedding layer parameters (sparse) are so large that we cannot store them in the trainer's memory during training. So we need to distribute them across several parameter servers, and fetch them row by row instead of fetching all of the parameters.
## How to use
Specify command-line arguments like `--loadsave_parameters_in_pserver=true --ports_num_for_sparse=1 --use_old_updater=1` when starting the paddle trainer, and also add something like `--ports_num_for_sparse=1 --pserver_num_threads=5` when starting the pserver processes.
Accordingly, configure your embedding layers like:
```python
SPARSE_REMOTE=True
w1 = data_layer(name="w1", size=dict_size)
emb1 = embedding_layer(input=w1, size=32, param_attr=ParameterAttribute(sparse_update=SPARSE_REMOTE))
w2 = data_layer(name="w2", size=dict_size)
emb2 = embedding_layer(input=w2, size=32, param_attr=ParameterAttribute(sparse_update=SPARSE_REMOTE))
...
```
## Implementation details
```c++
enum MatType {
MAT_NORMAL,
MAT_NORMAL_SHARED,
MAT_VALUE_SHARED,
MAT_SPARSE_ROW_IDS,
MAT_SPARSE_ROW_AUTO_GROW,
MAT_CACHE_ROW,
MAT_SPARSE_ROW,
MAT_SPARSE_ROW_PREFETCH,
MAT_SPARSE_ROW_PREFETCH_FULL_SIZE,
};
```
`MAT_SPARSE_ROW_PREFETCH` is what we use when configured to fetch only rows of the matrix during training.
In `trainer_internal.cpp:L93 trainOneBatch`:
```c++
if (config_->getOptConfig().use_sparse_remote_updater()) {
REGISTER_TIMER("prefetch");
gradientMachine_->prefetch(inArgs);
parameterUpdater_->getParametersRemote();
}
```
When doing actual network forward and backward, at the beginning of each batch, the trainer will try to download one row of data from pserver.
In `trainer/RemoteParameterUpdater.cpp`: `parameterUpdater_->getParametersRemote();`:
```c++
if (fullSize) {
...
} else {
getParams = [&] {
parameterClient_->getParameterSparse(
/* recvParameterType= */ PARAMETER_VALUE, sendBackParameterType);
};
applyL1 = [](Parameter& para, real decayRate) {
para.getMat(PARAMETER_VALUE)->applyL1(/*lr=*/1.0f, decayRate);
};
}
```
Calling `parameterClient_->getParameterSparse` will do remote call to pserver's `getParameterSparse`:
```c++
void ParameterServer2::getParameterSparse(const SendParameterRequest& request,
std::vector<Buffer>& inputBuffers,
SendParameterResponse* response,
std::vector<Buffer>* outputBuffers) {
(void)inputBuffers;
auto& buffer = *readWriteBuffer_;
size_t numReals = 0;
for (const auto& block : request.blocks()) {
numReals += getParameterConfig(block).dims(1);
}
buffer.resize(numReals);
VLOG(3) << "pserver: getParameterSparse, numReals=" << numReals;
ReadLockGuard guard(parameterMutex_);
size_t offset = 0;
for (const auto& block : request.blocks()) {
size_t width = getParameterConfig(block).dims(1);
Buffer buf = {buffer.data() + offset, width};
int type = request.send_back_parameter_type();
sendBackParameterSparse(block, type, response, &buf, width, outputBuffers);
offset += width;
}
}
```
`getParameterConfig(block).dims(1)` returns the width of the current "parameter block" (a shard of the parameter object);
the `getParameterSparse` remote call then returns only one row of data to the client.
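To make the buffer layout concrete, here is a small plain-Python sketch of the same packing bookkeeping (the widths are made-up numbers, not values from a real model):

```python
# Each requested block reports its row width via dims(1); the rows are packed
# back to back into one response buffer, as in ParameterServer2::getParameterSparse above.
block_widths = [32, 64, 32]              # hypothetical dims(1) of three parameter blocks
num_reals = sum(block_widths)            # buffer.resize(numReals) in the C++ code
buffer = [0.0] * num_reals

offset, slices = 0, []
for width in block_widths:
    slices.append((offset, width))       # Buffer buf = {buffer.data() + offset, width}
    offset += width

print(num_reals, slices)                 # 128 [(0, 32), (32, 64), (96, 32)]
```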
# Design Doc: Functions, Operators, and Layers
In a DL system, we can compose one or more fine-grained operators into a coarse-grained one. For example, the FC layer can be composed of a multiplication operator and an add operator.
Historically, some fine-grained operations are known as operators, and some coarse-level ones are known as layers. But we need a well-defined separation.
In general, operators are the very fine-grained operations, e.g., mul and add. In the implementation, we can write them as C++ functions:
```c++
template <typename T> T add(T x, T y) { return x + y; }
template <typename T> T mul(T x, T y) { return x * y; }
```
Then we can wrap them into operators which are C++ classes and can be created from Python bindings by name. A C macro can do this. For example, the following macro invocation
```c++
#define MAKE_FUNCTION_OPERATOR(mul);
```
generates
```c++
template <typename T> class mulOp : public OperatorBase {...};
REGISTER_OP(mulOp<float32>, "mul");
```
so that in Python we can create operator mul by:
```python
X1 = Var()
X2 = Var()
Y = Var()
paddle.cpp.create_operator("mul", input=[X1, X2], output=Y)
```
Also, at the same time, we can compose a coarse level C++ operator class by composing functions `mul` and `add`:
```c++
template <typename T>
class FCOp : public OperatorBase {
 public:
  void Run(...) {
    add(mul(Input<T>("X"), Input<T>("W")), Input<T>("b"));
  }
};
REGISTER_OP(FCOp, "fc");
```
We need to support such composition in Python as well. To do so, we need a higher level Python wrapping of operator creation than `paddle.cpp.create_operator`. This higher level operator API should be compatible with the layer API.
Let's explain using an example. Suppose that we are going to compose the FC using mul and add in Python, we'd like to have Python functions `mul` and `add` defined in module `operator`:
```python
def operator.mul(X1, X2):
    O = Var()
    paddle.cpp.create_operator("mul", input=[X1, X2], output=O)
    return O

def operator.add(X1, X2):
    O = Var()
    paddle.cpp.create_operator("add", input=[X1, X2], output=O)
    return O
```
The above code snippets are automatically generated. Given them, users can define
```python
def layer.fc(X):
    W = Var()
    b = Var()
    return operator.add(operator.mul(X, W), b)
```
If we don't have `operator.mul` and `operator.add`, the definition of `layer.fc` would be complicated:
```python
def layer.fc(X):
    W = Var()
    b = Var()
    O1 = Var()
    paddle.cpp.create_operator("mul", input=[X, W], output=O1)
    O2 = Var()
    paddle.cpp.create_operator("add", input=[O1, b], output=O2)
    return O2
```
We'd like to have Python bindings to operators in package `paddle.operator`, and Python compositions of operators in package `paddle.layer`. So we have the following concepts in the above illustrative example:
| C++ functions/functors | mul | add | | |
|------------------------|--------------|--------------|-------------|----------|
| C++ operator class | mulOp | addOp | FCOp | |
| Python binding | operator.mul | operator.add | operator.fc | |
| Python function | | | | layer.fc |
This is how we differentiate layers and operators in PaddlePaddle:
- those defined in C++ that have a lightweight Python wrapper in module `operators` are operators; whereas
- those that don't have C++ implementations but are Python compositions of C++ operators are known as layers.
# Design Doc: Computations as a Graph
A primary goal of the refactorization of PaddlePaddle is a more flexible representation of deep learning computation, in particular, a graph of operators and variables, instead of sequences of layers as before.
This document explains the construction of a graph in three steps:
- construct the forward part
- construct the backward part
- construct the optimization part
## The Construction of a Graph
Let us take the problem of image classification as a simple example. The application program that trains the model looks like:
```python
x = layer.data("images")
l = layer.data("label")
y = layer.fc(x)
cost = layer.mse(y, l)
optimize(cost)
train(cost, reader=mnist.train())
```
### Forward Part
The first four lines of the above program build the forward part of the graph.
![](images/graph_construction_example_forward_only.png)
In particular, the first line `x = layer.data("images")` creates variable x and a Feed operator that copies a column from the minibatch to x. `y = layer.fc(x)` creates not only the FC operator and output variable y, but also two parameters, W and b, and the initialization operators.
Initialization operators are kind of "run-once" operators -- the `Run` method increments a class data member counter so as to run at most once. This way, a parameter won't be initialized repeatedly, say, in every minibatch.
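A minimal Python sketch of this run-once behavior (the class and names here are illustrative, not Paddle's actual implementation) looks like:

```python
import numpy as np

class InitOp:
    """Initializes a parameter variable at most once, no matter how often it runs."""
    def __init__(self, initializer):
        self.initializer = initializer
        self.run_count = 0                      # the counter mentioned above

    def Run(self, out_var):
        self.run_count += 1                     # incremented on every call ...
        if self.run_count == 1:                 # ... but only the first call initializes
            out_var[:] = self.initializer(out_var.shape)

W = np.empty((2, 3))
init_W = InitOp(lambda shape: np.random.uniform(-1, 1, shape))
for _ in range(5):                              # e.g. once per minibatch
    init_W.Run(W)                               # W is only written on the first call
```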
In this example, all operators are created as `OpDesc` protobuf messages, and all variables are `VarDesc`. These protobuf messages are saved in a `BlockDesc` protobuf message.
### Backward Part
The fifth line `optimize(cost)` calls two functions, `ConstructBackwardGraph` and `ConstructOptimizationGraph`.
`ConstructBackwardGraph` traverses the forward graph in the `BlockDesc` protobuf message and builds the backward part.
![](images/graph_construction_example_forward_backward.png)
According to the chain rule of gradient computation, `ConstructBackwardGraph` would
1. create a gradient operator G for each operator F,
1. make all inputs, outputs, and outputs' gradient of F as inputs of G,
1. create gradients for all inputs of F, except for those who don't have gradients, like x and l, and
1. make all these gradients as outputs of G.
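The four steps above can be sketched in a few lines of Python. The sketch below is hypothetical pseudocode using plain dicts, not the real `OpDesc`/`VarDesc` messages or the actual `ConstructBackwardGraph` code:

```python
def construct_backward_graph(ops, no_grad_vars={"x", "l"}):
    grad_ops = []
    for op in reversed(ops):                                     # walk the forward part backwards
        grad_op = {
            "type": op["type"] + "_grad",                        # step 1: a gradient operator G per F
            "inputs": op["inputs"] + op["outputs"]               # step 2: F's inputs, outputs,
                      + [o + "_grad" for o in op["outputs"]],    #         and output gradients
            "outputs": [i + "_grad" for i in op["inputs"]        # steps 3-4: gradients of F's inputs,
                        if i not in no_grad_vars],               #            except x and l
        }
        grad_ops.append(grad_op)
    return ops + grad_ops

forward = [{"type": "fc",  "inputs": ["x", "W", "b"], "outputs": ["y"]},
           {"type": "mse", "inputs": ["y", "l"],      "outputs": ["cost"]}]
print(construct_backward_graph(forward))
```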
### Optimization Part
For each parameter, like W and b created by `layer.fc`, marked as double circles in the above graphs, `ConstructOptimizationGraph` creates an optimization operator to apply its gradient. This results in the complete graph:
![](images/graph_construction_example_all.png)
## Block and Graph
The words block and graph are interchangeable in the design of PaddlePaddle. A [Block](https://github.com/PaddlePaddle/Paddle/pull/3708) is a metaphor for the code and local variables in a pair of curly braces in programming languages, where operators are like statements or instructions. A graph of operators and variables is a representation of the block.
A Block keeps operators in an array `BlockDesc::ops`
```protobuf
message BlockDesc {
repeated OpDesc ops = 1;
repeated VarDesc vars = 2;
}
```
in the order in which they appear in user programs, like the Python program at the beginning of this article. We can imagine that in `ops` we have some forward operators, followed by some gradient operators, and then some optimization operators.
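For the example program at the beginning of this article, the conceptual content of `ops` would therefore be (operator names are illustrative only):

```python
# Conceptual ordering of BlockDesc::ops for the example program above.
ops = [
    "feed_x", "feed_l", "init_W", "init_b", "fc", "mse",   # forward operators
    "mse_grad", "fc_grad",                                 # gradient operators
    "sgd_W", "sgd_b",                                      # optimization operators
]
```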
IfOp should have only one branch. An IfOp operator takes a `cond` variable whose value must be a vector of N boolean elements. Its return value has M (M<=N) instances, each corresponding to a true element in `cond`.
```python
import paddle as pd
x = var()
y = var()
cond = var()
b = pd.create_ifop(inputs=[x], output_num=1)
with b.true_block():
    x = b.inputs(0)
    z = operator.add(x, y)
    b.set_output(0, operator.softmax(z))

out = b(cond)
```
If we want the output to still have N instances, we can use IfElseOp with a default value, whose minibatch size must be N:
```python
import paddle as pd
x = var()
y = var()
cond = var()
default_value = var()
b = pd.create_ifelseop(inputs=[x], output_num=1)
with b.true_block():
    x = b.inputs(0)
    z = operator.add(x, y)
    b.set_output(0, operator.softmax(z))

with b.false_block():
    x = b.inputs(0)
    z = layer.fc(x)
    b.set_output(0, operator.softmax(z))

out = b(cond)
```
If only true_block is set in an IfElseOp, we can have a default value for false as:
```python
import paddle as pd
x = var()
y = var()
cond = var()
default_value = var()
b = pd.create_ifelseop(inputs=[x], output_num=1, default_value=default_value)
with b.true_block():
    x = b.inputs(0)
    z = operator.add(x, y)
    b.set_output(0, operator.softmax(z))

out = b(cond)
```
where default_value is a list of vars for `cond` == False.
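As a concrete illustration of these semantics (plain Python standing in for the ops, with N = 4 and a scalar per instance; this is not the Paddle API):

```python
cond = [True, False, True, False]            # N = 4 boolean elements
x = [1.0, 2.0, 3.0, 4.0]
default_value = [0.0, 0.0, 0.0, 0.0]

def true_block(v):                           # stands in for the ops inside b.true_block()
    return v * 10

# IfOp: the output has only M = 2 instances, one per true element of cond.
ifop_out = [true_block(v) for v, c in zip(x, cond) if c]     # [10.0, 30.0]

# IfElseOp with a default value: the output keeps all N = 4 instances;
# false instances take the corresponding default value.
ifelse_out = [true_block(v) if c else d
              for v, c, d in zip(x, cond, default_value)]    # [10.0, 0.0, 30.0, 0.0]
```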
cat ./graph_construction_example.dot | \
sed 's/color=red/color=red, style=invis/g' | \
sed 's/color=green/color=green, style=invis/g' | \
dot -Tpng > graph_construction_example_forward_only.png
cat ./graph_construction_example.dot | \
sed 's/color=green/color=green, style=invis/g' | \
dot -Tpng > graph_construction_example_forward_backward.png
cat ./graph_construction_example.dot | \
dot -Tpng > graph_construction_example_all.png
digraph ImageClassificationGraph {
///////// The forward part /////////
FeedX [label="Feed", color=blue, shape=box];
FeedY [label="Feed", color=blue, shape=box];
InitW [label="Init", color=blue, shape=diamond];
Initb [label="Init", color=blue, shape=diamond];
FC [label="FC", color=blue, shape=box];
MSE [label="MSE", color=blue, shape=box];
x [label="x", color=blue, shape=oval];
l [label="l", color=blue, shape=oval];
y [label="y", color=blue, shape=oval];
W [label="W", color=blue, shape=doublecircle];
b [label="b", color=blue, shape=doublecircle];
cost [label="cost", color=blue, shape=oval];
FeedX -> x -> FC -> y -> MSE -> cost [color=blue];
FeedY -> l [color=blue];
InitW -> W [color=blue];
Initb -> b [color=blue];
W -> FC [color=blue];
b -> FC [color=blue];
l -> MSE [color=blue];
////////// The backward part /////////
MSE_Grad [label="MSE_grad", color=red, shape=box];
FC_Grad [label="FC_grad", color=red, shape=box];
d_cost [label="d cost", color=red, shape=oval];
d_y [label="d y", color=red, shape=oval];
d_b [label="d b", color=red, shape=oval];
d_W [label="d W", color=red, shape=oval];
cost -> MSE_Grad [color=red];
d_cost -> MSE_Grad [color=red];
x -> MSE_Grad [color=red];
l -> MSE_Grad [color=red];
y -> MSE_Grad -> d_y [color=red];
x -> FC_Grad [color=red];
y -> FC_Grad [color=red];
d_y -> FC_Grad [color=red];
W -> FC_Grad -> d_W [color=red];
b -> FC_Grad -> d_b [color=red];
////////// The optimizaiton part //////////
OPT_W [label="SGD", color=green, shape=box];
OPT_b [label="SGD", color=green, shape=box];
W -> OPT_W [color=green];
b -> OPT_b [color=green];
d_W -> OPT_W -> W [color=green];
d_b -> OPT_b -> b [color=green];
////////// Groupings //////////
subgraph clusterMSE {
style=invis;
MSE;
MSE_Grad;
}
subgraph clusterFC {
style=invis;
FC;
FC_Grad;
}
}
......@@ -101,6 +101,7 @@ if use_mkldnn
5. Add two `MkldnnMatrixPtr` members to **Argument**, named `mkldnnValue` and `mkldnnGrad`, to hold the memory buffers used by `MkldnnLayer`. Also add a cvt function (it will be renamed to something more suitable) to handle the conversion of memory between the "CPU device" and the "MKL-DNN device".
6. Add a piece of logic to the `getOutput` function of the parent class `Layer` to check `deviceId` and, when the device is not consistent between MKL-DNN and CPU, perform an up-front conversion, i.e. call `Argument`'s cvt function to move the output onto the required device.
7. Add a `use_mkldnn` flag to the existing `FLAGS` to choose whether to use the MKL-DNN related functionality.
8. On saving MKL-DNN parameters: since the MKL-DNN parameter format can differ from PaddlePaddle's original format, the format information needs to be saved together with the parameters. The current plan is to extend the `int32_t version` field of the [Header](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/parameter/Parameter.h#L247). This value has always been 0 in both v1 and v2, so we can make full use of it and define an enum covering all MKL-DNN parameter formats, so that `MKLDNNLayer` can obtain the required format information from the input parameters.
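A minimal sketch of that idea (illustrative Python; the tag names are hypothetical and the real change would live in the C++ parameter `Header`):

```python
# Hypothetical format tags reusing the otherwise-constant int32 `version` header field.
PARAM_FORMAT_ORIGINAL = 0        # the value v1/v2 have always written
PARAM_FORMAT_MKLDNN_OI = 1       # an example MKL-DNN weight layout tag

def needs_conversion(header_version):
    # A layer reading a parameter can tell the stored layout from the tag
    # and decide whether it must convert it to the layout it needs.
    return header_version == PARAM_FORMAT_ORIGINAL
```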
## References
......
# Design Doc: Operation Graph Based Parameter Server
## Abstract
We propose an approach to implement the parameter server. In this
approach, there is no fundamental difference between the trainer and
the parameter server: they both run subgraphs, but subgraphs of
different purposes.
## Background
The previous implementations of the parameter server do not run a
subgraph. Parameter initialization, optimizer computation, network
communication and checkpointing are implemented twice on both the
trainer and the parameter server.
It would be great if we could write code once and use it on both the
trainer and the parameter server: this reduces code duplication and
improves extensibility. Given that after the current refactoring we are
representing everything as a computing graph on the trainer,
representing everything as a computing graph on the parameter
server becomes a natural extension.
## Design
### Graph Converter
The *graph converter* converts the user-defined operation (OP) graph
into subgraphs to be scheduled on different nodes with the following
steps:
1. OP placement: the OPs will be placed on different nodes according
   to a heuristic that minimizes the estimated total computation
   time. Currently we will use a simple heuristic that puts parameter
   variables on parameter server workers and everything else on trainer
   workers.
1. Add communication OPs to enable the communication between nodes.
We will need these OPs: *Send*, *Recv*, *Enqueue*, *Dequeue*.
Below is an example of converting the user defined graph to the
subgraphs for the trainer and the parameter server:
<img src="src/local-graph.png" width="300"/>
After converting:
<img src="src/dist-graph.png" width="700"/>
1. The parameter variable W and its optimizer subgraph are placed on the parameter server.
1. Operators are added to the subgraphs.
- *Send* sends data to the connected *Recv* operator. The
  scheduler on the receive node will only schedule the *Recv* operator
  to run when the *Send* operator has run (the *Send* OP will mark
  the *Recv* OP runnable automatically).
- *Enqueue* enqueues the input variable; it can block until space
  becomes available in the queue.
- *Dequeue* outputs a configurable number of tensors from the
  queue. It will block until the queue has the required number of
  tensors.
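As an illustrative sketch (plain Python queues standing in for the Send/Recv/Enqueue/Dequeue OPs; the forward/optimizer math is a toy stand-in, not Paddle operators), the two converted subgraphs could behave like:

```python
from queue import Queue

grad_q, param_q = Queue(), Queue()        # what the Enqueue/Dequeue OPs would manage

def trainer_step(x, label, W):
    y = W * x                             # toy forward pass
    grad = (y - label) * x                # toy gradient
    grad_q.put(grad)                      # Send/Enqueue: ship the gradient to the pserver
    return param_q.get()                  # Recv: block until the updated W comes back

def pserver_step(W, lr=0.1):
    g = grad_q.get()                      # Dequeue: block until a gradient arrives
    W -= lr * g                           # the optimizer subgraph placed on the pserver
    param_q.put(W)                        # Send the updated parameter back to trainers
    return W
```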
### Benefits
- Model parallelism becomes easier to implement: it's an extension of
  the trainer-parameter server approach. We already have the
  communication OPs, but need to extend the graph converter's
  placement functionality.
- A user-defined optimizer is easier to add: the user can now express it
  as a subgraph.
- No more duplicated logic inside the trainer and the parameter
  server, as mentioned in the background section.
### Challenges
- It might be hard for the graph converter to cut a general graph
(without any hint for which subgraph is the optimizer). We may need
to label which subgraph inside the OP graph is the optimizer.
- It's important to balance the parameter shards across multiple
  parameter servers. If a single parameter is very big (some
  word-embedding, fully connected, or softmax layers), we need to
  automatically partition the single parameter onto different
  parameter servers when possible (only element-wise optimizers depend
  on the parameter variable).
### Discussion
- In the "Aync SGD" figure, the "W" variable on the parameter server
could be read and wrote concurrently, what is our locking strategy?
E.g., each variable have a lock cpp method to be invoked by every
OP, or, have a lock OP.
- Can the Enqueue OP be implemented under our current tensor design
(puts the input tensor into the queue tensor)?
- The *Dequeue* OP will have a variable number of outputs (depending on the
  `min_count` attribute); does our current design support it? (Similar
  question for the *Add* OP.)
### References:
[1] [TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45166.pdf)
......@@ -147,7 +147,7 @@ class CosineOp {
struct CosineOpProtoMaker : public OpProtoMaker {
CosineOpProtoMaker(OpProto* proto) : OpProtoMaker(proto) {
AddInput("input", "input of cosine op");
AddAttr("scale", "scale of cosine op", float).Default(1.0).LargerThan(0.0);
AddAttr("scale", "scale of cosine op", float).Default(1.0).GreaterThan(0.0);
AddType("cos");
AddComment("This is cos op");
}
......
## Background
PaddlePaddle divides the description of the neural network computation graph into two stages: compile time and runtime.
PaddlePaddle uses proto messages to describe the compile-time graph because
1. The computation graph should be able to be saved to a file.
1. In distributed training, the graph will be serialized and sent to multiple workers.
The computation graph is constructed from Data Nodes and Operation Nodes. The concepts that represent them are listed in the table below.
| |compile time|runtime|
|---|---|---|
|Data|VarDesc(proto)|Variable(cpp)|
|Operation|OpDesc(proto)|Operator(cpp)|
## Definition of VarDesc
A VarDesc should have a name and a value; in PaddlePaddle, the value will always be a tensor. Since we use LoDTensor most of the time, we add a LoDTensorDesc to represent it.
```proto
message VarDesc {
required string name = 1;
optional LoDTensorDesc lod_tensor = 2;
}
```
## Definition of LoDTensorDesc
```proto
enum DataType {
BOOL = 0;
INT16 = 1;
INT32 = 2;
INT64 = 3;
FP16 = 4;
FP32 = 5;
FP64 = 6;
}
message LoDTensorDesc {
required DataType data_type = 1;
repeated int32 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
optional int32 lod_level = 3 [default=0];
}
```
## Definition of Variable in Python
In the Python API, a layer takes Variables as input and returns Variables as output. There should be a class `Variable` in Python to help create and manage Variables.
```python
image = Variable(dims=[-1, 640, 480])
# fc1 and fc2 are both Variable
fc1 = layer.fc(input=image, output_size=10)
fc2 = layer.fc(input=fc1, output_size=20)
```
### What should class `Variable` have
1. `name`. A name of string type is used to mark the value of the Variable.
1. `initializer`. Since our Tensor does not have a value, we will always use some Operator to fill it at run time. So we should have an initializer method to help add the init operator.
1. `operator`. A Variable should record which operator produces it. The reason is:
- we use pd.eval(targets=[var1, var2]) to run the related ops to get the values of var1 and var2; var.op is used to trace the dependencies of the current variable.
In PaddlePaddle, we use a Block to describe the computation graph, so in the code we will use Block rather than Graph.
```python
import VarDesc
import LoDTensorDesc
import framework
def AddInitialOperator(variable, initializer):
    # add an initialize Operator to the block to init this Variable
    pass

class Variable(object):
    def __init__(self, name, dims, type, initializer):
        self._block = get_default_block()
        self._name = name
        self.op = None

        tensor_desc = LoDTensorDesc(data_type=type, dims=dims)
        _var_desc = VarDesc(name=name, lod_tensor=tensor_desc)
        self._var = framework.CreateVar(_var_desc)
        self._block.add_var(self)

        # add initial op according to initializer
        if initializer is not None:
            AddInitialOperator(self, initializer)

    def dims(self):
        return self._var.dims()

    def data_type(self):
        return self._var.data_type()

    def to_proto(self):
        pass
```
Then we can use this Variable to create a fc layer in Python.
```python
import paddle as pd
def flatten_size(X, num_flatten_dims):
    prod = 1  # product of the last num_flatten_dims dimensions
    for i in xrange(num_flatten_dims):
        prod = prod * X.dims[-i-1]
    return prod

def layer.fc(X, output_size, num_flatten_dims):
    W = Variable(pd.random_uniform(), type=FP32, dims=[flatten_size(X, num_flatten_dims), output_size])
    b = Variable(pd.random_uniform(), type=FP32, dims=[output_size])
    out = Variable(type=FP32)
    y = operator.fc(X, W, b, output=out)  # fc will put the fc op's output into out
    pd.InferShape(y)
    return out
x = Variable(dims=[-1, 640, 480])
y = layer.fc(x, output_size=100)
z = layer.fc(y, output_size=200)
paddle.eval(targets=[z], ...)
print(z)
```
......@@ -55,7 +55,7 @@ PaddlePaddle is a deep learning platform that originated at Baidu. This brief introduction
# linear network layer: ȳ = wx + b
ȳ = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b'))
# compute the cost, i.e. the distance between ȳ and the true y
cost = mse_cost(input= ȳ, label=y)
cost = square_error_cost(input= ȳ, label=y)
outputs(cost)
......@@ -69,7 +69,7 @@ PaddlePaddle is a deep learning platform that originated at Baidu. This brief introduction
- **Data layer**: the data layer `data_layer` is the entry point of the neural network; it reads the data in and passes it on to the following layers. There are two data layers here, corresponding to the variables `x` and `y`.
- **Fully connected layer**: the fully connected layer `fc_layer` is the basic computation unit; here it is used to model the linear relationship between the variables. Computation units are the core of a neural network. PaddlePaddle supports a large number of computation units and networks of arbitrary depth, so it can fit arbitrary functions to learn complex data relationships.
- **Regression cost layer**: the regression cost layer `mse_cost` is one of many cost function layers. During training they act as the network's exit, computing the model's error; they are the objective function for optimizing the model parameters.
- **Regression cost layer**: the regression cost layer `square_error_cost` is one of many cost function layers. During training they act as the network's exit, computing the model's error; they are the objective function for optimizing the model parameters.
After defining the network structure and saving it as `trainer_config.py`, run the following training command:
......
......@@ -49,7 +49,7 @@ To recover this relationship between ``X`` and ``Y``, we use a neural network wi
x = data_layer(name='x', size=1)
y = data_layer(name='y', size=1)
y_predict = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b'))
cost = mse_cost(input=y_predict, label=y)
cost = square_error_cost(input=y_predict, label=y)
outputs(cost)
Some of the most fundamental usages of PaddlePaddle are demonstrated:
......
......@@ -68,7 +68,7 @@ As a simple example, consider the following:
1. **BLAS Dependencies(optional)**
CMake will search BLAS libraries from system. If not found, OpenBLAS will be downloaded, built and installed automatically.
CMake will search BLAS libraries from the system. If not found, OpenBLAS will be downloaded, built and installed automatically.
To utilize preinstalled BLAS, you can simply specify MKL, OpenBLAS or ATLAS via `MKL_ROOT`, `OPENBLAS_ROOT` or `ATLAS_ROOT`.
```bash
......@@ -131,9 +131,9 @@ As a simple example, consider the following:
To build GPU version, you will need the following installed:
1. a CUDA-capable GPU
2. A supported version of Linux with a gcc compiler and toolchain
2. A supported version of Linux with a GCC compiler and toolchain
3. NVIDIA CUDA Toolkit (available at http://developer.nvidia.com/cuda-downloads)
4. NVIDIA cuDNN Library (availabel at https://developer.nvidia.com/cudnn)
4. NVIDIA cuDNN Library (available at https://developer.nvidia.com/cudnn)
The CUDA development environment relies on tight integration with the host development environment,
including the host compiler and C runtime libraries, and is therefore only supported on
......@@ -172,6 +172,7 @@ export PATH=<path to install>/bin:$PATH
# install PaddlePaddle Python modules.
sudo pip install <path to install>/opt/paddle/share/wheels/*.whl
```
## <span id="centos">Build on Centos 7</span>
### Install Dependencies
......@@ -192,9 +193,9 @@ sudo pip install <path to install>/opt/paddle/share/wheels/*.whl
To build GPU version, you will need the following installed:
1. a CUDA-capable GPU
2. A supported version of Linux with a gcc compiler and toolchain
2. A supported version of Linux with a GCC compiler and toolchain
3. NVIDIA CUDA Toolkit (available at http://developer.nvidia.com/cuda-downloads)
4. NVIDIA cuDNN Library (availabel at https://developer.nvidia.com/cudnn)
4. NVIDIA cuDNN Library (available at https://developer.nvidia.com/cudnn)
The CUDA development environment relies on tight integration with the host development environment,
including the host compiler and C runtime libraries, and is therefore only supported on
......@@ -222,7 +223,7 @@ mkdir build && cd build
```
Finally, you can build and install PaddlePaddle:
```bash
# you can add build option here, such as:
cmake3 .. -DCMAKE_INSTALL_PREFIX=<path to install>
......
......@@ -6,14 +6,12 @@
Installation
++++++++++++
PaddlePaddle provides several pre-built binaries for installation, including Docker images and Ubuntu deb packages. We recommend using the Docker image to deploy the environment, and contributions of more installation packages are welcome.
PaddlePaddle provides a Docker image to deploy the environment.
.. toctree::
:maxdepth: 1
docker_install_cn.rst
ubuntu_install_cn.rst
Build process
......
......@@ -8,14 +8,13 @@ Install PaddlePaddle
:maxdepth: 1
docker_install_en.rst
ubuntu_install_en.rst
Build from Source
-----------------
.. warning::
Please use :code:`deb` package or :code:`docker` image to install paddle. The building guide is used for hacking or contributing PaddlePaddle source code.
Please use the :code:`docker` image to install paddle. The building guide is intended for hacking on or contributing to the PaddlePaddle source code.
.. toctree::
:maxdepth: 1
......
Deploying PaddlePaddle on Ubuntu
===================================
PaddlePaddle provides a deb package for Ubuntu 14.04.
Installation
------------
The packages can be downloaded from: https://github.com/PaddlePaddle/Paddle/releases
Four versions are provided:
* cpu: supports mainstream x86 processors and uses the AVX instruction set.
* cpu-noavx: supports mainstream x86 processors without the AVX instruction set.
* gpu: supports mainstream x86 processors and the NVIDIA CUDA platform, using the AVX instruction set.
* gpu-noavx: supports mainstream x86 processors and the NVIDIA CUDA platform, without the AVX instruction set.
After downloading the package, run:
.. code-block:: shell
sudo apt-get install gdebi
gdebi paddle-*-cpu.deb
or:
.. code-block:: shell
dpkg -i paddle-*-cpu.deb
apt-get install -f
It is normal for :code:`dpkg -i` to report errors about missing dependencies;
:code:`apt-get install -f` will then continue installing PaddlePaddle.
After installation, you can check the installed paddle version with the command :code:`paddle version`:
.. code-block:: shell
PaddlePaddle 0.8.0b1, compiled with
with_avx: ON
with_gpu: OFF
with_double: OFF
with_python: ON
with_rdma: OFF
with_timer: OFF
with_predict_sdk:
Possible problems
-----------------
libcudart.so/libcudnn.so cannot be found
++++++++++++++++++++++++++++++++++++++++
After installation, running :code:`paddle train` reports the error:
.. code-block:: shell
0831 12:36:04.151525 1085 hl_dso_loader.cc:70] Check failed: nullptr != *dso_handle For Gpu version of PaddlePaddle, it couldn't find CUDA library: libcudart.so Please make sure you already specify its path.Note: for training data on Cpu using Gpu version of PaddlePaddle,you must specify libcudart.so via LD_LIBRARY_PATH.
The cause is that the CUDA runtime environment variables are not set. If you use the GPU version of PaddlePaddle, please install CUDA 7.5 and cuDNN 5 locally, and set:
.. code-block:: shell
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/lib:$LD_LIBRARY_PATH
export PATH=/usr/local/cuda/bin:$PATH
Debian Package installation guide
=================================
PaddlePaddle supports :code:`deb` packages. The installation of this :code:`deb` package is tested on Ubuntu 14.04, but it should also support other Debian-based Linux distributions.
There are four versions of the debian package, :code:`cpu`, :code:`gpu`, :code:`cpu-noavx`, :code:`gpu-noavx`. The :code:`noavx` versions support CPUs that do not provide :code:`AVX` instructions. The download URL of the :code:`deb` packages is: https://github.com/baidu/Paddle/releases/
After downloading the PaddlePaddle deb packages, you can install them with :code:`gdebi`.
.. code-block:: bash
gdebi paddle-*.deb
If :code:`gdebi` is not installed, you can use :code:`sudo apt-get install gdebi` to install it.
Or you can use the following commands to install PaddlePaddle.
.. code-block:: bash
dpkg -i paddle-*.deb
apt-get install -f
And if you use the GPU version of the deb package, you need to install the CUDA toolkit and cuDNN, and set related environment variables (such as LD_LIBRARY_PATH) first. It is normal for `dpkg -i` to report errors; `apt-get install -f` will continue installing paddle and its dependencies.
......@@ -8,7 +8,7 @@ paddle.init(use_gpu=False)
x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(2))
y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1))
cost = paddle.layer.mse_cost(input=y_predict, label=y)
cost = paddle.layer.square_error_cost(input=y_predict, label=y)
# create parameters
parameters = paddle.parameters.create(cost)
......
......@@ -81,9 +81,9 @@ PaddlePaddle supports different types of input data, mainly four types, and
.. code-block:: bash
y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
cost = paddle.layer.mse_cost(input=y_predict, label=y)
cost = paddle.layer.square_error_cost(input=y_predict, label=y)
Here, x and y are the input layers described earlier; y_predict takes x as input and is followed by a fully connected layer; cost takes y_predict and y as input and is followed by a squared-error cost layer.
Here, x and y are the input layers described earlier; y_predict takes x as input and is followed by a fully connected layer; cost takes y_predict and y as input and is followed by a squared-error cost layer.
The last layer, cost, records the complete topology of the neural network. By combining different layers, we can finish building the neural network.
......@@ -147,4 +147,4 @@ PaddlePaddle supports different types of input data, mainly four types, and
.. literalinclude:: src/train.py
:linenos:
For a practical application of linear regression, see `Chapter 1 <http://book.paddlepaddle.org/index.html>`_ of the PaddlePaddle book.
\ No newline at end of file
For a practical application of linear regression, see `Chapter 1 <http://book.paddlepaddle.org/index.html>`_ of the PaddlePaddle book.
# 编译PaddlePaddle和运行单元测试
## 需要的软硬件
为了开发PaddlePaddle,我们需要
1. 一台电脑,可以装的是 Linux, BSD, Windows 或者 MacOS 操作系统,以及
1. Docker。
不需要依赖其他任何软件了。即便是 Python 和 GCC 都不需要,因为我们会把所有编译工具都安装进一个 Docker image 里。
## 总体流程
1. 获取源码
```bash
git clone https://github.com/paddlepaddle/paddle
```
2. 安装开发工具到 Docker image 里
```bash
cd paddle; docker build -t paddle:dev .
```
请注意这个命令结尾处的 `.`;它表示 `docker build` 应该读取当前目录下的 [`Dockerfile`文件](https://github.com/PaddlePaddle/Paddle/blob/develop/Dockerfile),按照其内容创建一个名为 `paddle:dev` 的 Docker image,并且把各种开发工具安装进去。
3. 编译
以下命令启动一个 Docker container 来执行 `paddle:dev` 这个 Docker image,同时把当前目录(源码树根目录)映射为 container 里的 `/paddle` 目录,并且运行 `Dockerfile` 描述的默认入口程序 [`build.sh`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh)。这个脚本调用 `cmake``make` 来编译 `/paddle` 里的源码,结果输出到 `/paddle/build`,也就是本地的源码树根目录里的 `build` 子目录。
```bash
docker run --rm -v $PWD:/paddle paddle:dev
```
上述命令编译出一个 CUDA-enabled 版本。如果我们只需要编译一个只支持 CPU 的版本,可以用
```bash
docker run --rm -e WITH_GPU=OFF -v $PWD:/paddle paddle:dev
```
4. 运行单元测试
用本机的第一个 GPU 来运行包括 GPU 单元测试在内的所有单元测试:
```bash
NV_GPU=0 nvidia-docker run --rm -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest"
```
如果编译的时候我们用了 `WITH_GPU=OFF` 选项,那么编译过程只会产生 CPU-based 单元测试,那么我们也就不需要 nvidia-docker 来运行单元测试了。我们只需要:
```bash
docker run --rm -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest"
```
有时候我们只想运行一个特定的单元测试,比如 `memory_test`,我们可以
```bash
nvidia-docker run --rm -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest -V -R memory_test"
```
5. 清理
有时候我们会希望清理掉已经下载的第三方依赖以及已经编译的二进制文件。此时只需要:
```bash
rm -rf build
```
## 为什么要 Docker 呀?
- 什么是 Docker?
如果您没有听说 Docker,可以把它想象为一个类似 virtualenv 的系统,但是虚拟的不仅仅是 Python 的运行环境。
- Docker 还是虚拟机?
有人用虚拟机来类比 Docker。需要强调的是:Docker 不会虚拟任何硬件,Docker container 里运行的编译工具实际上都是在本机的 CPU 和操作系统上直接运行的,性能和把编译工具安装在本机运行一样。
- 为什么用 Docker?
把工具和配置都安装在一个 Docker image 里可以标准化编译环境。这样如果遇到问题,其他人可以复现问题以便帮助。
另外,对于习惯使用Windows和MacOS的开发者来说,使用Docker就不用配置交叉编译环境了。
- 我可以选择不用Docker吗?
当然可以。大家可以用与把开发工具安装进 Docker image 相同的方式,把这些工具直接安装到本机。这篇文档介绍基于 Docker 的开发流程,是因为这个流程比其他方法都更简便。
- 学习 Docker 有多难?
理解 Docker 并不难,大概花十分钟看一下[这篇文章](https://zhuanlan.zhihu.com/p/19902938)。这可以帮您省掉花一小时安装和配置各种开发工具,以及切换机器时需要新安装的辛苦。别忘了 PaddlePaddle 更新可能导致需要新的开发工具。更别提简化问题复现带来的好处了。
- 我可以用 IDE 吗?
当然可以,因为源码就在本机上。IDE 默认调用 make 之类的程序来编译源码,我们只需要配置 IDE 来调用 Docker 命令编译源码即可。
很多 PaddlePaddle 开发者使用 Emacs。他们在自己的 `~/.emacs` 配置文件里加两行
```emacs
(global-set-key "\C-cc" 'compile)
(setq compile-command
"docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
```
就可以按 `Ctrl-C``c` 键来启动编译了。
- 可以并行编译吗?
是的。我们的 Docker image 运行一个 [Bash 脚本](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh)。这个脚本调用 `make -j$(nproc)` 来启动和 CPU 核一样多的进程来并行编译。
## 可能碰到的问题
- Docker 需要 sudo
如果用自己的电脑开发,自然也就有管理员权限(sudo)了。如果用公用的电脑开发,需要请管理员安装和配置好 Docker。此外,PaddlePaddle 项目也正在努力支持其他不需要 sudo 的容器技术,比如 rkt。
- 在 Windows/MacOS 上编译很慢
Docker 在 Windows 和 MacOS 都可以运行。不过实际上是运行在一个 Linux 虚拟机上。可能需要注意给这个虚拟机多分配一些 CPU 和内存,以保证编译高效。具体做法请参考[这个issue](https://github.com/PaddlePaddle/Paddle/issues/627)
- 磁盘不够
本文中的例子里,`docker run` 命令里都用了 `--rm` 参数,这样保证运行结束之后的 containers 不会保留在磁盘上。可以用 `docker ps -a` 命令看到停止后但是没有删除的 containers。`docker build` 命令有时候会产生一些中间结果,是没有名字的 images,也会占用磁盘。可以参考[这篇文章](https://zaiste.net/posts/removing_docker_containers/)来清理这些内容。
# Build PaddlePaddle from Source Code and Run Unit Test
## What Developers Need
To contribute to PaddlePaddle, you need
1. A computer -- Linux, BSD, Windows, MacOS, and
1. Docker.
Nothing else. Not even Python and GCC, because you can install all build tools into a Docker image. We run all the tools by running this image.
## General Process
1. Retrieve source code.
```bash
git clone https://github.com/paddlepaddle/paddle
```
2. Install build tools into a Docker image.
```bash
cd paddle; docker build -t paddle:dev .
```
Please be aware of the `.` at the end of the command, which refers to the [`./Dockerfile` file](https://github.com/PaddlePaddle/Paddle/blob/develop/Dockerfile). `docker build` follows instructions in this file to create a Docker image named `paddle:dev`, and installs building tools into it.
3. Build from source.
The following command starts a Docker container from the image `paddle:dev`, maps the current directory (the root of the source tree) to `/paddle` inside the container, and runs the default entry point [`build.sh`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh) as specified in the Dockerfile. `build.sh` invokes `cmake` and `make` to build the PaddlePaddle source code mapped to `/paddle`, and writes the outputs to `/paddle/build`, which maps to the `build` subdirectory of the source tree on the host.
```bash
docker run -v $PWD:/paddle paddle:dev
```
The above command builds a CUDA-enabled version. To build a CPU-only version, we can type
```bash
docker run -e WITH_GPU=OFF -v $PWD:/paddle paddle:dev
```
4. Run unit tests.
To run all unit tests using the first GPU of a node:
```bash
NV_GPU=0 nvidia-docker run -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest"
```
If we used `WITH_GPU=OFF` at build time, it generates only CPU-based unit tests, and we don't need nvidia-docker to run them. We can just run
```bash
docker run -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest"
```
Sometimes we want to run a specific unit test, say `memory_test`; in that case we can run
```bash
nvidia-docker run -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest -V -R memory_test"
```
5. Clean Build.
Sometimes we might want to clean up all third-party dependencies and built binaries. To do so, just run
```bash
rm -rf build
```
## Docker, Or Not?
- What is Docker?
If you haven't heard of it, consider it something like Python's virtualenv.
- Docker or virtual machine?
Some people compare Docker with VMs, but Docker doesn't virtualize any hardware or run a guest OS, which means there is no compromise on performance.
- Why Docker?
Using a Docker image of build tools standardizes the building environment, which makes it easier for others to reproduce your problems and to help.
Also, some build tools don't run on Windows or Mac or BSD, but Docker runs almost everywhere, so developers can use whatever computer they want.
- Can I choose not to use Docker?
Sure, you don't have to install the build tools into a Docker image; instead, you can install them on your local computer. This document describes the Docker-based workflow because it makes development much easier.
- How difficult is it to learn Docker?
It takes about ten minutes to read [an introductory article](https://docs.docker.com/get-started), and it saves you more than an hour of installing and configuring the required build tools, especially when new versions of PaddlePaddle require new tools. Not to mention the time saved when other people try to reproduce an issue you ran into.
- Can I use my favorite IDE?
Yes, of course. The source code resides on your local computer, and you can edit it using whatever editor you like.
Many PaddlePaddle developers are using Emacs. They add the following few lines into their `~/.emacs` configure file:
```emacs
(global-set-key "\C-cc" 'compile)
(setq compile-command
"docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
```
so they could type `Ctrl-C` and `c` to build PaddlePaddle from source.
- Does Docker do parallel building?
Our build Docker image runs a [Bash script](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh), which calls `make -j$(nproc)` to start as many build processes as there are CPU cores.
## Some Gotchas
- Docker requires sudo
An owner of a computer has the administrative privilege, a.k.a., sudo, and Docker requires this privilege to work properly. If you use a shared computer for development, please ask the administrator to install and configure Docker. We will do our best to support rkt, another container technology that doesn't require sudo.
- Docker on Windows/MacOS builds slowly
On Windows and MacOS, Docker containers run in a Linux VM. You might want to give this VM more memory and CPUs to make the build efficient. Please refer to [this issue](https://github.com/PaddlePaddle/Paddle/issues/627) for details.
- Not enough disk space
The examples in this article use the `--rm` option with the `docker run` command, which ensures that stopped containers are not left on disk. We can use `docker ps -a` to list all containers, including stopped ones. Sometimes `docker build` generates intermediate dangling images, which also take disk space. To clean them up, please refer to [this article](https://zaiste.net/posts/removing_docker_containers/).
# 如何写新的Operator
- [概念简介](#概念简介)
- [实现C++类](#实现C++类)
- [定义ProtoMaker类](#定义ProtoMaker类)
- [定义Operator类](#定义Operator类)
- [定义OpKernel类](#定义OpKernel类)
- [注册Operator](#注册Operator)
- [编译](#编译)
- [绑定Python](#绑定Python)
- [实现单元测试](#实现单元测试)
- [前向Operator单测](#前向Operator单测)
- [反向Operator单测](#反向Operator单测)
- [编译和执行](#编译和执行)
## 概念简介
简单介绍需要用到的基类,详细介绍请参考设计文档。
- `framework::OperatorBase`: Operator(简写,Op)基类。
- `framework::OpKernel`: Op计算函数的基类,称作Kernel。
- `framework::OperatorWithKernel`:继承自OperatorBase,表示该Op带有计算函数(Kernel),称作带Kernel的Op。
- `class OpProtoAndCheckerMaker`:描述该Op的输入、输出、属性、注释,主要用于Python API接口生成。
依据是否包含Kernel,可以将Op分为两种:包含Kernel的Op和不包含Kernel的Op。前者的定义继承自`OperatorWithKernel`,后者继承自`OperatorBase`。本教程主要介绍带Kernel的Op如何写,简单总结Op需要包含的内容如下:
内容 | 定义位置
-------------- | :----------------------
OpProtoMaker定义 | `.cc`文件,Backward Op不需要定义OpProtoMaker
Op定义 | `.cc`文件
Kernel实现 | CPU、GPU共享Kernel实现在`.h`文件中,否则,CPU 实现在`.cc`文件中,GPU 实现在`.cu`文件中。
注册Op | Op注册实现在`.cc`文件;Kernel注册CPU实现在`.cc`文件中,GPU实现在`.cu`文件中
实现新的op都添加至目录[paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators)下,文件命名以`*_op.h`(如有) 、 `*_op.cc``*_op.cu`(如有)结尾。
下面以矩阵乘操作,即[MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc)为例来介绍如何写带Kernel的Operator。
## 实现C++类
### 1. 定义ProtoMaker类
矩阵乘法的公式:$Out = X * Y$, 可见该计算由两个输入,一个输出组成。
首先定义`ProtoMaker`来描述该Op的输入、输出,并添加注释:
```cpp
class MulOpMaker : public framework::OpProtoAndCheckerMaker {
public:
MulOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "The first input of mul op");
AddInput("Y", "The second input of mul op");
AddOutput("Out", "The output of mul op");
AddComment(R"DOC(
Two Element Mul Operator.
The equation is: Out = X * Y
)DOC");
}
};
```
[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L43)继承自`framework::OpProtoAndCheckerMaker`,构造函数含有2个参数:
- `framework::OpProto` : 存储Op的输入输出和参数属性,将用于Python API接口的生成。
- `framework::OpAttrChecker` : 用于检查参数属性的合法性。
构造函数里通过`AddInput`添加输入参数,通过`AddOutput`添加输出参数,通过`AddComment`添加Op的注释。这些函数会将对应内容添加到`OpProto`中。
上面的代码在`MulOp`中添加两个输入`X``Y`,添加了一个输出`Out`,并解释了各自含义,命名请遵守命名规范。
再以[`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37)为例:
```cpp
template <typename AttrType>
class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
public:
ScaleOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "The input tensor of scale operator.").NotInGradient();
AddOutput("Out", "The output tensor of scale operator.").NotInGradient();
AddComment(R"DOC(Scale operator
The equation is: Out = scale*X
)DOC");
AddAttr<AttrType>("scale", "scale of scale operator.").SetDefault(1.0);
}
};
```
这个例子有两处不同:
- `AddInput("X","...").NotInGradient()` : 表示`X`这个输入不参与`ScaleOp`对应的梯度Op计算之中,如果Op的某个输入不参与反向梯度的计算,请显示地调用`.NotInGradient()`进行设置。
- `AddAttr<AttrType>("scale", "...").SetDefault(1.0);` : 增加`scale`系数,作为参数属性,并且设置默认值为1.0。
### 2. 定义Operator类
下面的代码实现了MulOp的定义:
```cpp
class MulOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(const framework::InferShapeContext &ctx) const override {
auto dim0 = ctx.Input<Tensor>("X")->dims();
auto dim1 = ctx.Input<Tensor>("Y")->dims();
PADDLE_ENFORCE_EQ(dim0.size(), 2,
"input X(%s) should be a tensor with 2 dims, a matrix",
ctx.op_.Input("X"));
PADDLE_ENFORCE_EQ(dim1.size(), 2,
"input Y(%s) should be a tensor with 2 dims, a matrix",
ctx.op_.Input("Y"));
PADDLE_ENFORCE_EQ(
dim0[1], dim1[0],
"First matrix's width must be equal with second matrix's height.");
ctx.Output<Tensor>("Out")->Resize({dim0[0], dim1[1]});
}
};
```
[`MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L22)继承自`OperatorWithKernel`。`public`成员:
```cpp
using framework::OperatorWithKernel::OperatorWithKernel;
```
这句表示使用基类`OperatorWithKernel`的构造函数,也可写成:
```cpp
MulOp(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorWithKernel(type, inputs, outputs, attrs) {}
```
还需要重写`InferShape`接口。`InferShape`为const函数,不能修改Op的成员变量,参数为`const framework::InferShapeContext &ctx`,通过该参数可获取到输入输出以及属性。它的功能是:
- 1). 做检查, 尽早报错:检查输入数据维度、类型等是否合法。
- 2). 设置输出Tensor的形状。
通常`OpProtoMaker``Op`类的定义写在`.cc`文件中,和下面将要介绍的注册函数一起放在`.cc`
### 3. 定义OpKernel类
`MulKernel`继承自`framework::OpKernel`,带有下面两个模板参数:
- `typename Place`: 表示设备类型,不同设备(CPU、GPU)共享同一个Kernel时,需加该模板参数,不共享则不加,一个不共享的例子是[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)
- `typename T` : 表示数据类型,如`float`, `double`等。
需要为`MulKernel`类重写`Compute`接口。
- `Compute`接受一个输入参数:`const framework::ExecutionContext& context`
-`InferShapeContext`相比,`ExecutionContext`增加了设备类型,同样可获取到输入输出和属性参数。
- `Compute`函数里实现`OpKernel`的具体计算逻辑。
下面是 `MulKernel` 的 `Compute` 实现:
```cpp
template <typename Place, typename T>
class MulKernel : public framework::OpKernel {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* X = context.Input<Tensor>("X");
auto* Y = context.Input<Tensor>("Y");
auto* Z = context.Output<Tensor>("Out");
Z->mutable_data<T>(context.GetPlace());
auto* device_context =
const_cast<platform::DeviceContext*>(context.device_context_);
math::matmul<Place, T>(*X, false, *Y, false, 1, Z, 0, device_context);
}
};
```
需要注意:**不同设备(CPU、GPU)共享一个Op定义,是否共享同一个`OpKernel`,取决于`Compute`调用的函数是否支持不同设备。**
`MulOp`的CPU、GPU实现共享同一个`Kernel``OpKernel`不共享的例子可以参考:[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)
为了使`OpKernel`的计算过程书写更加简单,并且CPU、GPU的代码可以复用,我们通常借助 Eigen unsupported Tensor模块来实现`Compute`接口。关于在PaddlePaddle中如何使用Eigen库,请参考[使用文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_cn.md)
到此,前向Op实现完成。接下来,需要在`.cc`文件中注册该op和kernel。
反向Op类的定义,反向OpKernel的定义与前向Op类似,这里不再赘述。**但需注意反向Op没有`ProtoMaker`**
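下面给出一个反向Op定义的示意(这里的`MulOpGrad`写法仅为示意,并非仓库中的完整实现):它同样继承自`OperatorWithKernel`并重写`InferShape`,但没有对应的`ProtoMaker`。

```cpp
// 仅为示意:反向Op通常以前向的输入、输出以及输出的梯度作为输入,
// 这里只示意把输入梯度的形状设置成与对应输入相同
class MulOpGrad : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
  void InferShape(const framework::InferShapeContext &ctx) const override {
    auto dim0 = ctx.Input<Tensor>("X")->dims();
    auto dim1 = ctx.Input<Tensor>("Y")->dims();
    ctx.Output<Tensor>(framework::GradVarName("X"))->Resize(dim0);
    ctx.Output<Tensor>(framework::GradVarName("Y"))->Resize(dim1);
  }
};
```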
### 4. 注册Operator
-`.cc`文件中注册前向、反向Op类,注册CPU Kernel。
```cpp
namespace ops = paddle::operators;
REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad);
REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(mul_grad,
ops::MulGradKernel<paddle::platform::CPUPlace, float>);
```
在上面的代码中:
- `REGISTER_OP` : 注册`ops::MulOp`类,类型名为`mul`,该类的`ProtoMaker`为`ops::MulOpMaker`,并注册`ops::MulOpGrad`,类型名为`mul_grad`。
- `REGISTER_OP_WITHOUT_GRADIENT` : 用于注册没有反向的Op,用法可参考本节末尾的示例。
- `REGISTER_OP_CPU_KERNEL` :注册`ops::MulKernel`类,并特化模板参数为`paddle::platform::CPUPlace`和`float`类型;同理,注册`ops::MulGradKernel`类。
-`.cu`文件中注册GPU Kernel。
- 请注意,如果GPU Kernel的实现基于Eigen unsupported模块,那么在 `.cu`的开始请加上宏定义 `#define EIGEN_USE_GPU`,代码示例如下:
```cpp
// if use Eigen unsupported module before include head files
#define EIGEN_USE_GPU
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel<paddle::platform::GPUPlace, float>);
REGISTER_OP_GPU_KERNEL(mul_grad,
ops::MulGradKernel<paddle::platform::GPUPlace, float>);
```
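如果一个Op没有反向计算,可以用上面提到的`REGISTER_OP_WITHOUT_GRADIENT`来注册。下面是一个示意(其中`my_op`、`MyOp`、`MyOpMaker`均为假设的名字,并非仓库中的实际代码):

```cpp
// 仅为示意:注册一个没有反向的Op,名字均为假设
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(my_op, ops::MyOp, ops::MyOpMaker);
```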
### 5. 编译
- 简单**无特殊依赖**的OP无需修改CMakeList.txt文件。[paddle/operators/CMakeLists.txt](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/CMakeLists.txt) 会自动将 `paddle/operators` 目录下新增的 `*_op.cc` 文件加入编译。
- 较为复杂、**有额外依赖** 的operator仍需要修改[paddle/operators/CMakeLists.txt](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/CMakeLists.txt)。如,`mul_op` 依赖 `math_function`,需要在`CMakeLists.txt`中添加如下内容:
```
op_library(mul_op SRCS mul_op.cc mul_op.cu DEPS math_function)
```
- 运行下面命令可以进行编译:
```
make mul_op
```
## 绑定Python
- 绑定Python
在 [`paddle/pybind/pybind.cc`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/pybind.cc) 中使用`USE_OP`告知编译器需要链接的Op,具体解释参考[代码注释](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_registry.h#L81)。
```
USE_OP(mul);
```
如果只实现了CPU版本,则使用`USE_CPU_ONLY_OP`:
```
USE_CPU_ONLY_OP(gather);
```
如果OP不带Kernel,则使用`USE_NO_KENREL_OP`:
```
USE_NO_KENREL_OP(recurrent);
```
- 生成库
无需修改 [`paddle/pybind/CMakeLists.txt`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/CMakeLists.txt)文件,`paddle/operators` 目录下新增的 `*_op.cc` 文件会被自动添加链接到生成的lib库中。
## 实现单元测试
单测包括对比前向Op不同设备(CPU、GPU)的实现、对比反向Op不同设备(CPU、GPU)的实现、反向Op的梯度测试。下面介绍[`MulOp`的单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/test_mul_op.py)。
### 前向Operator单元测试
前向Op单元测试继承自`unittest.TestCase`,并定义元类`__metaclass__ = OpTestMeta`。各项更加具体的单元测试在`OpTestMeta`里完成。测试前向Operator,需要:
1. 在`setUp`函数定义输入、输出,以及相关的属性参数。
2. 生成随机的输入数据。
3. 在Python脚本中实现与前向operator相同的计算逻辑,得到输出值,与operator前向计算的输出进行对比。
```python
import unittest
import numpy as np
from gradient_checker import GradientChecker, create_op
from op_test_util import OpTestMeta
class TestMulOp(unittest.TestCase):
__metaclass__ = OpTestMeta
def setUp(self):
self.type = "mul"
self.inputs = {
'X': np.random.random((32, 84)).astype("float32"),
'Y': np.random.random((84, 100)).astype("float32")
}
self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])}
```
上面的代码首先导入依赖的包,下面是对`setUp`函数中操作的重要变量的详细解释:
- `self.type = "mul" ` : 定义类型,与operator注册时注册的类型一致。
- `self.inputs` : 定义输入,类型为`numpy.array`,并初始化。
- `self.outputs` : 定义输出,并在Python脚本中完成与operator同样的计算逻辑,返回Python端的计算结果。
### 反向Operator单元测试
反向Op单元测试继承自`GradientChecker`,而`GradientChecker`继承自`unittest.TestCase`,因此,**反向单元测试函数需要以`test_`开头**。
```python
class TestMulGradOp(GradientChecker):
def setUp(self):
self.op = create_op("mul")
self.inputs = {
'X': np.random.random((32, 84)).astype("float32"),
'Y': np.random.random((84, 100)).astype("float32")
}
def test_cpu_gpu_compare(self):
self.compare_grad(self.op, self.inputs)
def test_normal(self):
# mul op will enlarge the relative error
self.check_grad(
self.op, self.inputs, ["X", "Y"], "Out", max_relative_error=0.5)
def test_ignore_x(self):
self.check_grad(
self.op,
self.inputs, ["Y"],
"Out",
max_relative_error=0.5,
no_grad_set={"X"})
def test_ignore_y(self):
self.check_grad(
self.op,
self.inputs, ["X"],
"Out",
max_relative_error=0.5,
no_grad_set={"Y"})
```
下面解释代码中一些关键的地方:
- 调用`create_op("mul")`创建反向Op对应的前向Op。
- 调用`compare_grad`函数对比CPU、GPU计算结果。
- `test_normal`中调用`check_grad`使用数值法检测梯度正确性和稳定性。
- 第一个参数`self.op` : 前向Op。
- 第二个参数`self.inputs` : 输入词典,词典的Key和`ProtoMaker`定义保持一致。
- 第三个参数`["X", "Y"]` : 指定对输入变量`X`、`Y`做梯度检测。
- 第四个参数`"Out"` : 指定前向网络最终的输出目标变量`Out`
- `test_ignore_x`和`test_ignore_y`分支用来测试只需要计算一个输入梯度的情况。
### 编译和执行单元测试
单元测试编写完成之后,在[`python/paddle/v2/framework/tests/CMakeLists.txt`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/CMakeLists.txt)中添加以下内容,将单元测试加入工程:
```
py_test(test_mul_op SRCS test_mul_op.py)
```
请注意,**不同于Op的编译测试,运行单元测试时需要编译整个工程**,并且编译时需要打开`WITH_TESTING`,即`cmake paddle_dir -DWITH_TESTING=ON`。编译成功后,执行下面的命令来运行单元测试:
```bash
make test ARGS="-R test_mul_op -V"
```
或者:
```bash
ctest -R test_mul_op
```
## 在Paddle中如何使用Eigen
神经网络本质上是一个计算图,计算需要的数据存放在`Tensor`中,而计算过程是由`Operator`来描述的。在执行时,`Operator`调用对应`OpKernel`中的`Compute`接口,实现对`Tensor`的操作。
### Eigen Tensor模块
Eigen Tensor模块对element-wise计算提供了强大的支持,并且书写一份代码,可以同时在CPU、GPU执行。但Eigen Tensor是一个正在开发中的模块,因此可能测试不够完备,文档较少。
关于Eigen Tensor模块的详细介绍请参考[文档1](https://github.com/RLovelett/eigen/blob/master/unsupported/Eigen/CXX11/src/Tensor/README.md)[文档2](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md)
### paddle::framework::Tensor
Paddle Tensor定义在framework目录下,其主要接口如下:
```cpp
class Tensor {
public:
/*! Return a pointer to mutable memory block. */
template <typename T>
inline T* data();
/**
* @brief Return a pointer to mutable memory block.
* @note If not exist, then allocation.
*/
template <typename T>
inline T* mutable_data(platform::Place place);
/**
* @brief Return a pointer to mutable memory block.
*
* @param[in] dims The dimensions of the memory block.
* @param[in] place The place of the memory block.
*
* @note If not exist, then allocation.
*/
template <typename T>
inline T* mutable_data(DDim dims, platform::Place place);
/*! Resize the dimensions of the memory block. */
inline Tensor& Resize(const DDim& dims);
/*! Return the dimensions of the memory block. */
inline const DDim& dims() const;
private:
/*! holds the memory block if allocated. */
std::shared_ptr<Placeholder> holder_;
/*! points to dimensions of memory block. */
DDim dim_;
};
```
`Placeholder`的作用是延迟分配内存,即我们可以先定义一个Tensor,然后使用Resize接口设置Tensor的大小,最后再调用mutable_data接口分配实际的内存。
```cpp
paddle::framework::Tensor t;
paddle::platform::CPUPlace place;
// set size first
t.Resize({2, 3});
// allocate memory on CPU later
t.mutable_data(place);
```
### paddle::framework::Tensor使用样例
下面以AddOp为例说明Tensor的使用过程:
- InferShape
在运行神经网络计算图时,我们先调用每个`Operator``InferShape`接口,根据输入Tensor的大小来设置输出Tensor的大小,`Resize`接口会被调用。
```cpp
void InferShape(const framework::InferShapeContext &ctx) const override {
PADDLE_ENFORCE_EQ(ctx.Input<Tensor>("X")->dims(),
ctx.Input<Tensor>("Y")->dims(),
"Two input of Add Op's dimension must be same.");
ctx.Output<Tensor>("Out")->Resize(ctx.Input<Tensor>("X")->dims());
}
```
- Run
`Operator``Run`接口最终会调用对应`OpKernel``Compute`接口,在这时真正的分配内存,`mutable_data`接口会被调用。
```cpp
void Compute(const framework::ExecutionContext& context) const override {
auto* input0 = context.Input<Tensor>("X");
auto* input1 = context.Input<Tensor>("Y");
auto* output = context.Output<Tensor>("Out");
output->mutable_data<T>(context.GetPlace());
auto x = EigenVector<T>::Flatten(*input0);
auto y = EigenVector<T>::Flatten(*input1);
auto z = EigenVector<T>::Flatten(*output);
auto place = context.GetEigenDevice<Place>();
z.device(place) = x + y;
}
```
### paddle::framework::Tensor到EigenTensor的转换
如上一小节所示,在具体的计算中,我们需要先把输入Tensor和输出Tensor转换为Eigen支持的格式。我们在[eigen.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/eigen.h)中提供了一些全局函数用来实现paddle::framework::Tensor到EigenTensor/EigenMatrix/EigenVector/EigenScalar的转换。
以EigenTensor为例,做一个介绍
```cpp
Tensor t;
float* p = t.mutable_data<float>(make_ddim({1, 2, 3}), platform::CPUPlace());
for (int i = 0; i < 1 * 2 * 3; i++) {
p[i] = static_cast<float>(i);
}
EigenTensor<float, 3>::Type et = EigenTensor<float, 3>::From(t);
```
From是EigenTensor模板提供的一个接口,可以实现从paddle::framework::Tensor到EigenTensor的转换。由于EigenTensor的rank是模板参数,因此在转换时需要显式地指定。
在Eigen中,不同rank的Tensor是不同类型,Vector是rank为1的Tensor。需要额外注意的是,EigenVector<T>::From方法是把paddle中的一维Tensor转为Eigen的一维Tensor,在这里用EigenVector来表示;而EigenVector<T>::Flatten方法是把paddle中的一个Tensor进行reshape操作,压扁成为Eigen的一维Tensor,类型仍然为EigenVector。
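下面用一小段代码示意两者的区别(其中`t1`、`t2`为假设的变量,仅为示意):

```cpp
// 仅为示意:t1 为 shape {6} 的一维 Tensor,t2 为 shape {2, 3} 的二维 Tensor
auto v1 = EigenVector<float>::From(t1);     // 一维 Tensor 直接转换为一维 EigenVector
auto v2 = EigenVector<float>::Flatten(t2);  // 任意维 Tensor 先 reshape 压扁成一维,再转换
```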
更多的转换方法请参考eigen_test.cc中的[单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/eigen_test.cc)
### 实现计算
当需要完成计算时,我们需要让等式左边的EigenTensor调用device接口。在这里需要注意的是,这里的EigenTensor之间的运算只是改变了原有Tensor中的数据,而不会改变原有Tensor的shape信息。
```cpp
auto x = EigenVector<T>::Flatten(*input0);
auto y = EigenVector<T>::Flatten(*input1);
auto z = EigenVector<T>::Flatten(*output);
auto place = context.GetEigenDevice<Place>();
z.device(place) = x + y;
```
在这段代码中,input0/input1/output可以是任意维度的Tensor。我们调用了EigenVector的Flatten接口,把任意维度的Tensor转为了一维的EigenVector。而在计算结束之后,input0/input1/output的原有shape信息不变。如果想改变原有Tensor的shape信息,可以调用Resize接口进行改变。
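例如(仅为示意,`output`即上文`Compute`示例中的输出Tensor,写法沿用前文`Resize`的用法):

```cpp
// 仅为示意:计算结束后如需改变 output 的 shape,可以显式调用 Resize
output->Resize({2, 3});
```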
由于Eigen Tensor模块的文档较少,我们可以参考TensorFlow的[kernels](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/kernels)模块下的相关`OpKernel`的计算代码。
......@@ -19,6 +19,7 @@
.. toctree::
:maxdepth: 1
dev/build_cn.rst
dev/write_docs_cn.rst
dev/contribute_to_paddle_cn.md
......
......@@ -18,6 +18,7 @@ Development
.. toctree::
:maxdepth: 1
dev/build_en.rst
dev/new_layer_en.rst
dev/contribute_to_paddle_en.md
......
......@@ -213,7 +213,7 @@ I1116 09:10:17.123440 50 Util.cpp:130] Calling runInitFunctions
I1116 09:10:17.123764 50 Util.cpp:143] Call runInitFunctions done.
[WARNING 2016-11-16 09:10:17,227 default_decorators.py:40] please use keyword arguments in paddle config.
[INFO 2016-11-16 09:10:17,239 networks.py:1282] The input order is [movie_id, title, genres, user_id, gender, age, occupation, rating]
[INFO 2016-11-16 09:10:17,239 networks.py:1289] The output order is [__mse_cost_0__]
[INFO 2016-11-16 09:10:17,239 networks.py:1289] The output order is [__square_error_cost_0__]
I1116 09:10:17.392917 50 Trainer.cpp:170] trainer mode: Normal
I1116 09:10:17.613910 50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process
I1116 09:10:17.680917 50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process
......
......@@ -7,4 +7,3 @@ PaddlePaddle Documentation
getstarted/index_en.rst
howto/index_en.rst
api/index_en.rst
about/index_en.rst
......@@ -63,13 +63,24 @@ func WithAddr(addr string) func(c *Client) error {
// WithEtcd sets the client to use etcd for master discovery.
func WithEtcd(endpoints []string, timeout time.Duration) func(*Client) error {
return func(c *Client) error {
cli, err := clientv3.New(clientv3.Config{
Endpoints: endpoints,
DialTimeout: timeout,
})
if err != nil {
var cli *clientv3.Client
f := func() error {
var err error
cli, err = clientv3.New(clientv3.Config{
Endpoints: endpoints,
DialTimeout: timeout,
})
return err
}
for {
err := f()
if err != nil {
log.Warningln(err)
} else {
break
}
time.Sleep(time.Second)
}
ch := make(chan string, 1)
a, err := GetKey(cli, DefaultAddrPath, timeout)
......@@ -101,9 +112,6 @@ func NewClient(opts ...func(*Client) error) (*Client, error) {
}
}
c.ch = make(chan record, c.bufSize)
// FIXME: connection is created asyncrosly in monitorMaster go routine,
// ensure the connection is ready for use before calling c.addClient.
time.Sleep(time.Second)
return c, nil
}
......
......@@ -15,6 +15,7 @@ if(Boost_FOUND)
add_subdirectory(platform)
add_subdirectory(framework)
add_subdirectory(operators)
add_subdirectory(pybind)
endif()
if(WITH_C_API)
......
......@@ -53,7 +53,10 @@ add_custom_target(paddle_capi_whole ALL
set_target_properties(paddle_capi_whole
PROPERTIES IMPORTED_LOCATION ${CMAKE_CURRENT_BINARY_DIR}/${capi_whole_library})
set(LINK_FLAGS " -Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/export.sym -Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/export.map")
# TODO: merge mkl into paddle_capi_shared
add_library(paddle_capi_shared SHARED ${CAPI_SOURCES})
set_target_properties(paddle_capi_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
target_include_directories(paddle_capi_shared PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
link_paddle_exe(paddle_capi_shared)
......
{
global:
paddle_*;
local:
*;
};
......@@ -146,3 +146,19 @@ paddle_error paddle_gradient_machine_randomize_param(
m->machine->randParameters();
return kPD_NO_ERROR;
}
paddle_error paddle_gradient_machine_get_layer_output(
paddle_gradient_machine machine,
const char* layerName,
paddle_arguments args) {
auto m = cast(machine);
auto out = paddle::capi::cast<paddle::capi::CArguments>(args);
if (m == nullptr || layerName == nullptr || out == nullptr ||
m->machine == nullptr) {
return kPD_NULLPTR;
}
auto layerOutput = m->machine->getLayerOutput(layerName);
out->args.push_back(layerOutput);
return kPD_NO_ERROR;
}
......@@ -39,7 +39,11 @@ PD_API paddle_error paddle_gradient_machine_create_for_inference(
/**
* @brief Create a gradient machine used for model inference, using config with
* parameters which is generated by `paddle merge_model`.
* @param [out] machine that used for model inference.
* Example:
* paddle merge_model \
* --model_dir="pass-00000" \
* --model_file="merged_model.paddle"
* @param [out] machine that used for model inference
* @param [in] mergedModel
* @param [in] size
* @return paddle_error
......@@ -97,6 +101,18 @@ paddle_gradient_machine_randomize_param(paddle_gradient_machine machine);
PD_API paddle_error
paddle_gradient_machine_destroy(paddle_gradient_machine machine);
/**
* @brief Get the output of the layer named `layerName`.
 * @param [in] machine gradient machine that has run an inference
* @param [in] layerName name of specified layer
* @param [out] args output of the specified layer
* @return paddle_error
*/
PD_API paddle_error
paddle_gradient_machine_get_layer_output(paddle_gradient_machine machine,
const char* layerName,
paddle_arguments args);
#ifdef __cplusplus
}
#endif
......
......@@ -173,6 +173,96 @@ extern void hl_avgpool_backward(const int frameCnt,
real* backGrad,
const int outStride);
extern void hl_maxpool3D_forward(const int frameCnt,
const real* inputData,
const int channels,
const int depth,
const int height,
const int width,
const int pooledD,
const int pooledH,
const int pooledW,
const int sizeZ,
const int sizeY,
const int sizeX,
const int strideD,
const int strideH,
const int strideW,
const int paddingD,
const int paddingH,
const int paddingW,
real* tgtData,
real* maxPoolIdxData,
const int tgtStride);
extern void hl_maxpool3D_backward(const int frameCnt,
const real* outGrad,
const int channels,
const int depth,
const int height,
const int width,
const int pooledD,
const int pooledH,
const int pooledW,
const int sizeZ,
const int sizeY,
const int sizeX,
const int strideD,
const int strideH,
const int strideW,
const int paddingD,
const int paddingH,
const int paddingW,
real scaleA,
real scaleB,
real* targetGrad,
real* maxPoolIdxData,
const int outStride);
extern void hl_avgpool3D_forward(const int frameCnt,
const real* inputData,
const int channels,
const int depth,
const int height,
const int width,
const int pooledD,
const int pooledH,
const int pooledW,
const int sizeZ,
const int sizeY,
const int sizeX,
const int strideD,
const int strideH,
const int strideW,
const int paddingD,
const int paddingH,
const int paddingW,
real* tgtData,
const int tgtStride);
extern void hl_avgpool3D_backward(const int frameCnt,
const real* outGrad,
const int channels,
const int depth,
const int height,
const int width,
const int pooledD,
const int pooledH,
const int pooledW,
const int sizeZ,
const int sizeY,
const int sizeX,
const int strideD,
const int strideH,
const int strideW,
int paddingD,
int paddingH,
int paddingW,
real scaleA,
real scaleB,
real* backGrad,
const int outStride);
/**
* @brief Bilinear interpolation forward.
*
......@@ -275,4 +365,4 @@ extern void hl_maxout_backward(real* inGrad,
size_t featLen,
size_t groups);
#endif /* HL_CNN_H_ */
#endif // HL_CNN_H_
......@@ -18,14 +18,6 @@ limitations under the License. */
#ifndef __NVCC__
#include "paddle/math/MathFunctions.h"
#ifndef PADDLE_TYPE_DOUBLE
#define CBLAS_GEMM paddle::gemm<float>
#else
#define CBLAS_GEMM paddle::gemm<double>
#endif
template<class OpResetOutput>
void hl_naive_gru_forward_reset_output(OpResetOutput opResetOutput,
real *gateValue,
......@@ -210,51 +202,6 @@ inline void forward_final_output(OpFinalOutput opFinalOutput,
}
}
template<class OpResetOutput, class OpFinalOutput>
void hl_cpu_gru_forward(OpResetOutput opResetOutput,
OpFinalOutput opFinalOutput,
hl_gru_value value,
int frameSize,
int batchSize,
hl_activation_mode_t active_node,
hl_activation_mode_t active_gate) {
if (value.prevOutValue) {
CBLAS_GEMM(CblasNoTrans,
CblasNoTrans,
batchSize,
2 * frameSize,
frameSize,
1,
value.prevOutValue,
frameSize,
value.gateWeight,
frameSize * 2,
1,
value.gateValue,
frameSize * 3);
}
forward_reset_output(opResetOutput, value, frameSize, batchSize, active_gate);
if (value.prevOutValue) {
CBLAS_GEMM(CblasNoTrans,
CblasNoTrans,
batchSize,
frameSize,
frameSize,
1,
value.resetOutputValue,
frameSize,
value.stateWeight,
frameSize,
1,
value.gateValue + frameSize * 2,
frameSize * 3);
}
forward_final_output(opFinalOutput, value, frameSize, batchSize, active_node);
}
template<class OpStateGrad>
void hl_naive_gru_backward_state_grad(OpStateGrad opStateGrad,
real *gateValue,
......@@ -525,86 +472,6 @@ inline void backward_reset_grad(OpResetGrad opResetGrad,
}
}
template<class OpStateGrad, class OpResetGrad>
void hl_cpu_gru_backward(OpStateGrad opStateGrad,
OpResetGrad opResetGrad,
hl_gru_value value,
hl_gru_grad grad,
int frameSize,
int batchSize,
hl_activation_mode_t active_node,
hl_activation_mode_t active_gate) {
backward_state_grad(opStateGrad, value, grad,
frameSize, batchSize, active_node);
if (value.prevOutValue && grad.prevOutGrad) {
CBLAS_GEMM(CblasNoTrans,
CblasTrans,
batchSize,
frameSize,
frameSize,
1,
grad.gateGrad + frameSize * 2,
frameSize * 3,
value.stateWeight,
frameSize,
0,
grad.resetOutputGrad,
frameSize);
if (grad.stateWeightGrad) {
CBLAS_GEMM(CblasTrans,
CblasNoTrans,
frameSize,
frameSize,
batchSize,
1,
value.resetOutputValue,
frameSize,
grad.gateGrad + frameSize * 2,
frameSize * 3,
1,
grad.stateWeightGrad,
frameSize);
}
}
backward_reset_grad(opResetGrad, value, grad,
frameSize, batchSize, active_gate);
if (grad.prevOutGrad && value.prevOutValue) {
CBLAS_GEMM(CblasNoTrans,
CblasTrans,
batchSize,
frameSize,
frameSize * 2,
1,
grad.gateGrad,
frameSize * 3,
value.gateWeight,
frameSize * 2,
1,
grad.prevOutGrad,
frameSize);
if (grad.gateWeightGrad) {
CBLAS_GEMM(CblasTrans,
CblasNoTrans,
frameSize,
frameSize * 2,
batchSize,
1,
value.prevOutValue,
frameSize,
grad.gateGrad,
frameSize * 3,
1,
grad.gateWeightGrad,
frameSize * 2);
}
}
}
#endif
#endif // HL_CPU_GRU_CUH_
......@@ -214,7 +214,8 @@ extern void hl_conv_workspace(hl_tensor_descriptor input,
int* convBwdDataAlgo,
size_t* bwdDataLimitBytes,
int* convBwdFilterAlgo,
size_t* bwdFilterLimitBytes);
size_t* bwdFilterLimitBytes,
bool useDilation);
/**
* @brief destroy filter descriptor.
......@@ -242,7 +243,9 @@ extern void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
int padding_height,
int padding_width,
int stride_height,
int stride_width);
int stride_width,
int dilation_h = 1,
int dilation_w = 1);
/**
* @brief reset convolution descriptor.
......@@ -262,7 +265,9 @@ extern void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
int padding_height,
int padding_width,
int stride_height,
int stride_width);
int stride_width,
int dilation_h = 1,
int dilation_w = 1);
/**
* @brief destroy convolution descriptor.
......
......@@ -224,4 +224,80 @@ extern void hl_matrix_collect_shared_bias(real* B_d,
extern void hl_matrix_rotate(
real* mat, real* matRot, int dimM, int dimN, bool clockWise);
/**
* @brief Matrix vol2Col: Convert 3D volume into col matrix
*
 * @param[in] dataSrc input matrix.
 * @param[in] channels channels of dataSrc.
* @param[in] depth depth of matSrc.
* @param[in] height height of matSrc.
* @param[in] width width of matSrc.
* @param[in] filterD depth of filter.
* @param[in] filterH height of filter.
* @param[in] filterW width of filter.
* @param[in] strideD stride in the depth.
* @param[in] strideH stride in the height.
* @param[in] strideW stride in the width.
* @param[in] paddingD padding in the depth.
* @param[in] paddingH padding in the height.
* @param[in] paddingW padding in the width.
* @param[out] dataDst output matrix.
*
*/
extern void hl_matrix_vol2Col(const real* dataSrc,
int channels,
int depth,
int height,
int width,
int filterD,
int filterH,
int filterW,
int strideD,
int strideH,
int strideW,
int paddingD,
int paddingH,
int paddingW,
real* dataDst);
/**
* @brief Matrix col2Vol: Convert col matrix into 3D volume
*
 * @param[out] dataDst output matrix.
 * @param[in] channels channels of dataDst.
* @param[in] depth depth of matDst.
* @param[in] height height of matDst.
* @param[in] width width of matDst.
* @param[in] filterD depth of filter.
* @param[in] filterH height of filter.
* @param[in] filterW width of filter.
* @param[in] strideD stride in the depth.
* @param[in] strideH stride in the height.
* @param[in] strideW stride in the width.
* @param[in] paddingD padding in the depth.
* @param[in] paddingH padding in the height.
* @param[in] paddingW padding in the width.
 * @param[in] dataSrc input matrix.
 * @param[in] alpha scale applied to the accumulated input value.
 * @param[in] beta scale applied to the existing value in dataDst.
*
*/
extern void hl_matrix_col2Vol(real* dataDst,
int channels,
int depth,
int height,
int width,
int filterD,
int filterH,
int filterW,
int strideD,
int strideH,
int strideW,
int paddingD,
int paddingH,
int paddingW,
const real* dataSrc,
real alpha,
real beta);
#endif /* HL_MATRIX_H_ */
......@@ -87,6 +87,96 @@ inline void hl_avgpool_backward(const int frameCnt,
real* backGrad,
const int outStride) {}
inline void hl_maxpool3D_forward(const int frameCnt,
const real* inputData,
const int channels,
const int depth,
const int height,
const int width,
const int pooledD,
const int pooledH,
const int pooledW,
const int sizeZ,
const int sizeY,
const int sizeX,
const int strideD,
const int strideH,
const int strideW,
const int paddingD,
const int paddingH,
const int paddingW,
real* tgtData,
real* maxPoolIdxData,
const int tgtStride) {}
inline void hl_maxpool3D_backward(const int frameCnt,
const real* outGrad,
const int channels,
const int depth,
const int height,
const int width,
const int pooledD,
const int pooledH,
const int pooledW,
const int sizeZ,
const int sizeY,
const int sizeX,
const int strideD,
const int strideH,
const int strideW,
const int paddingD,
const int paddingH,
const int paddingW,
real scaleA,
real scaleB,
real* targetGrad,
real* maxPoolIdxData,
const int outStride) {}
inline void hl_avgpool3D_forward(const int frameCnt,
const real* inputData,
const int channels,
const int depth,
const int height,
const int width,
const int pooledD,
const int pooledH,
const int pooledW,
const int sizeZ,
const int sizeY,
const int sizeX,
const int strideD,
const int strideH,
const int strideW,
const int paddingD,
const int paddingH,
const int paddingW,
real* tgtData,
const int tgtStride) {}
inline void hl_avgpool3D_backward(const int frameCnt,
const real* outGrad,
const int channels,
const int depth,
const int height,
const int width,
const int pooledD,
const int pooledH,
const int pooledW,
const int sizeZ,
const int sizeY,
const int sizeX,
const int strideD,
const int strideH,
const int strideW,
const int paddingD,
const int paddingH,
const int paddingW,
real scaleA,
real scaleB,
real* backGrad,
const int outStride) {}
inline void hl_bilinear_forward(const real* inData,
const size_t inImgH,
const size_t inImgW,
......
......@@ -78,7 +78,9 @@ inline void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
int padding_height,
int padding_width,
int stride_height,
int stride_width) {}
int stride_width,
int dilation_h,
int dilation_w) {}
inline void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
hl_tensor_descriptor image,
......@@ -86,7 +88,9 @@ inline void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
int padding_height,
int padding_width,
int stride_height,
int stride_width) {}
int stride_width,
int dilation_h,
int dilation_w) {}
inline void hl_destroy_convolution_descriptor(hl_convolution_descriptor conv) {}
......@@ -99,7 +103,8 @@ inline void hl_conv_workspace(hl_tensor_descriptor input,
int* convBwdDataAlgo,
size_t* bwdDataLimitBytes,
int* convBwdFilterAlgo,
size_t* bwdFilterLimitBytes) {}
size_t* bwdFilterLimitBytes,
bool useDilation) {}
inline void hl_convolution_forward(hl_tensor_descriptor input,
real* input_data,
......
......@@ -99,4 +99,38 @@ inline void hl_matrix_collect_shared_bias(real* B_d,
inline void hl_matrix_rotate(
real* mat, real* matRot, int dimM, int dimN, bool clockWise) {}
inline void hl_matrix_vol2Col(const real* dataSrc,
int channels,
int depth,
int height,
int width,
int filterD,
int filterH,
int filterW,
int strideD,
int strideH,
int strideW,
int paddingD,
int paddingH,
int paddingW,
real* dataDst) {}
inline void hl_matrix_col2Vol(real* dataDst,
int channels,
int depth,
int height,
int width,
int filterD,
int filterH,
int filterW,
int strideD,
int strideH,
int strideW,
int paddingD,
int paddingH,
int paddingW,
const real* dataSrc,
real alpha,
real beta) {}
#endif // HL_MATRIX_STUB_H_
......@@ -201,7 +201,8 @@ void hl_conv_workspace(hl_tensor_descriptor input,
int* convBwdDataAlgo,
size_t* bwdDataLimitBytes,
int* convBwdFilterAlgo,
size_t* bwdFilterLimitBytes) {
size_t* bwdFilterLimitBytes,
bool useDilation) {
#if CUDNN_VERSION >= 4000
CHECK_NOTNULL(input);
......@@ -213,21 +214,60 @@ void hl_conv_workspace(hl_tensor_descriptor input,
size_t memoryLimitBytes =
(1LL << 20) * FLAGS_cudnn_conv_workspace_limit_in_mb;
// For dilation
int algo = 0;
// cudnn convolution forward configuration
cudnnTensorDescriptor_t fwd_src_desc = GET_TENSOR_DESCRIPTOR(input);
cudnnTensorDescriptor_t fwd_dest_desc = GET_TENSOR_DESCRIPTOR(output);
cudnnFilterDescriptor_t fwd_filter_desc = GET_FILTER_DESCRIPTOR(filter);
cudnnConvolutionDescriptor_t fwd_conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
// cudnn convolution backward data configuration
cudnnFilterDescriptor_t bwd_data_filter_desc = GET_FILTER_DESCRIPTOR(filter);
cudnnTensorDescriptor_t bwd_data_diff_desc = GET_TENSOR_DESCRIPTOR(output);
cudnnTensorDescriptor_t bwd_data_grad_desc = GET_TENSOR_DESCRIPTOR(input);
cudnnConvolutionDescriptor_t bwd_data_conv_desc =
GET_CONVOLUTION_DESCRIPTOR(conv);
// cudnn convolution backward filter configuration
cudnnTensorDescriptor_t bwd_filter_src_desc = GET_TENSOR_DESCRIPTOR(input);
cudnnTensorDescriptor_t bwd_filter_diff_desc = GET_TENSOR_DESCRIPTOR(output);
cudnnConvolutionDescriptor_t bwd_filter_conv_desc =
GET_CONVOLUTION_DESCRIPTOR(conv);
cudnnFilterDescriptor_t bwd_filter_grad_desc = GET_FILTER_DESCRIPTOR(filter);
CHECK_CUDNN(dynload::cudnnGetConvolutionForwardAlgorithm(
t_resource.cudnn_handle,
fwd_src_desc,
fwd_filter_desc,
fwd_conv_desc,
fwd_dest_desc,
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
memoryLimitBytes,
reinterpret_cast<cudnnConvolutionFwdAlgo_t*>(convFwdAlgo)));
if (useDilation) {
convFwdAlgo = &algo;
convBwdDataAlgo = &algo;
convBwdFilterAlgo = &algo;
} else {
CHECK_CUDNN(dynload::cudnnGetConvolutionForwardAlgorithm(
t_resource.cudnn_handle,
fwd_src_desc,
fwd_filter_desc,
fwd_conv_desc,
fwd_dest_desc,
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
memoryLimitBytes,
reinterpret_cast<cudnnConvolutionFwdAlgo_t*>(convFwdAlgo)));
CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataAlgorithm(
t_resource.cudnn_handle,
bwd_data_filter_desc,
bwd_data_diff_desc,
bwd_data_conv_desc,
bwd_data_grad_desc,
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
memoryLimitBytes,
reinterpret_cast<cudnnConvolutionBwdDataAlgo_t*>(convBwdDataAlgo)));
CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
t_resource.cudnn_handle,
bwd_filter_src_desc,
bwd_filter_diff_desc,
bwd_filter_conv_desc,
bwd_filter_grad_desc,
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
memoryLimitBytes,
reinterpret_cast<cudnnConvolutionBwdFilterAlgo_t*>(convBwdFilterAlgo)));
}
CHECK_CUDNN(dynload::cudnnGetConvolutionForwardWorkspaceSize(
t_resource.cudnn_handle,
......@@ -238,23 +278,6 @@ void hl_conv_workspace(hl_tensor_descriptor input,
static_cast<cudnnConvolutionFwdAlgo_t>(*convFwdAlgo),
fwdLimitBytes));
// cudnn convolution backward data configuration
cudnnFilterDescriptor_t bwd_data_filter_desc = GET_FILTER_DESCRIPTOR(filter);
cudnnTensorDescriptor_t bwd_data_diff_desc = GET_TENSOR_DESCRIPTOR(output);
cudnnTensorDescriptor_t bwd_data_grad_desc = GET_TENSOR_DESCRIPTOR(input);
cudnnConvolutionDescriptor_t bwd_data_conv_desc =
GET_CONVOLUTION_DESCRIPTOR(conv);
CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataAlgorithm(
t_resource.cudnn_handle,
bwd_data_filter_desc,
bwd_data_diff_desc,
bwd_data_conv_desc,
bwd_data_grad_desc,
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
memoryLimitBytes,
reinterpret_cast<cudnnConvolutionBwdDataAlgo_t*>(convBwdDataAlgo)));
CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
t_resource.cudnn_handle,
bwd_data_filter_desc,
......@@ -264,23 +287,6 @@ void hl_conv_workspace(hl_tensor_descriptor input,
static_cast<cudnnConvolutionBwdDataAlgo_t>(*convBwdDataAlgo),
bwdDataLimitBytes));
// cudnn convolution backward filter configuration
cudnnTensorDescriptor_t bwd_filter_src_desc = GET_TENSOR_DESCRIPTOR(input);
cudnnTensorDescriptor_t bwd_filter_diff_desc = GET_TENSOR_DESCRIPTOR(output);
cudnnConvolutionDescriptor_t bwd_filter_conv_desc =
GET_CONVOLUTION_DESCRIPTOR(conv);
cudnnFilterDescriptor_t bwd_filter_grad_desc = GET_FILTER_DESCRIPTOR(filter);
CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
t_resource.cudnn_handle,
bwd_filter_src_desc,
bwd_filter_diff_desc,
bwd_filter_conv_desc,
bwd_filter_grad_desc,
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
memoryLimitBytes,
reinterpret_cast<cudnnConvolutionBwdFilterAlgo_t*>(convBwdFilterAlgo)));
CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
t_resource.cudnn_handle,
bwd_filter_src_desc,
......@@ -603,7 +609,9 @@ void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
int padding_height,
int padding_width,
int stride_height,
int stride_width) {
int stride_width,
int dilation_h,
int dilation_w) {
CHECK_NOTNULL(conv);
cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)malloc(
......@@ -625,18 +633,24 @@ void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
padding_width,
stride_height,
stride_width,
1,
1,
dilation_h,
dilation_w,
mode,
data_type));
#else
if (dilation_h > 1 || dilation_w > 1) {
LOG(FATAL)
<< "Current cuDNN version does't support for dilation convolution. "
<< "The dilation convolution requires cuDNN >= v6.0.";
}
CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(hl_conv->desc,
padding_height,
padding_width,
stride_height,
stride_width,
1,
1,
dilation_h,
dilation_w,
mode));
#endif
......@@ -659,7 +673,9 @@ void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
int padding_height,
int padding_width,
int stride_height,
int stride_width) {
int stride_width,
int dilation_h,
int dilation_w) {
CHECK_NOTNULL(conv);
CHECK_NOTNULL(image);
CHECK_NOTNULL(filter);
......@@ -678,8 +694,8 @@ void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
padding_width,
stride_height,
stride_width,
1,
1,
dilation_h,
dilation_w,
mode,
data_type));
#else
......@@ -688,8 +704,8 @@ void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
padding_width,
stride_height,
stride_width,
1,
1,
dilation_h,
dilation_w,
mode));
#endif
......
......@@ -592,3 +592,204 @@ void hl_matrix_rotate(
mat, matRot, dimM, dimN, clockWise);
CHECK_SYNC("hl_matrix_rotate failed");
}
__global__ void keMatrixVol2Col(int num_kernels,
const real* dataSrc,
real* dataDst,
int depth,
int height,
int width,
int filterD,
int filterH,
int filterW,
int strideD,
int strideH,
int strideW,
int paddingD,
int paddingH,
int paddingW,
int depth_col,
int height_col,
int width_col) {
for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_kernels;
index += blockDim.x * gridDim.x) {
int w_out = index % width_col;
int h_out = (index / width_col) % height_col;
int d_out = (index / width_col / height_col) % depth_col;
int channel_in = index / width_col / height_col / depth_col;
int channel_out = channel_in * filterD * filterH * filterW;
int w_in = w_out * strideW - paddingW;
int h_in = h_out * strideH - paddingH;
int d_in = d_out * strideD - paddingD;
dataDst +=
((channel_out * depth_col + d_out) * height_col + h_out) * width_col +
w_out;
dataSrc += ((channel_in * depth + d_in) * height + h_in) * width + w_in;
for (int k = 0; k < filterD; ++k) {
for (int i = 0; i < filterH; ++i) {
for (int j = 0; j < filterW; ++j) {
int d = d_in + k;
int h = h_in + i;
int w = w_in + j;
*dataDst = (d >= 0 && d < depth && h >= 0 && h < height && w >= 0 &&
w < width)
? dataSrc[(k * height + i) * width + j]
: 0;
dataDst += depth_col * height_col * width_col;
}
}
}
}
}
void hl_matrix_vol2Col(const real* dataSrc,
int channels,
int depth,
int height,
int width,
int filterD,
int filterH,
int filterW,
int strideD,
int strideH,
int strideW,
int paddingD,
int paddingH,
int paddingW,
real* dataDst) {
int depth_col = (depth + 2 * paddingD - filterD) / strideD + 1;
int height_col = (height + 2 * paddingH - filterH) / strideH + 1;
int width_col = (width + 2 * paddingW - filterW) / strideW + 1;
int num_kernels = channels * depth_col * height_col * width_col;
const int threads = 512;
const int blocks = DIVUP(num_kernels, threads);
keMatrixVol2Col<<<blocks, threads, 0, STREAM_DEFAULT>>>(num_kernels,
dataSrc,
dataDst,
depth,
height,
width,
filterD,
filterH,
filterW,
strideD,
strideH,
strideW,
paddingD,
paddingH,
paddingW,
depth_col,
height_col,
width_col);
CHECK_SYNC("hl_matrix_vol2Col failed");
}
__global__ void keMatrixCol2Vol(int num_kernels,
real* dataDst,
const real* dataSrc,
int depth,
int height,
int width,
int filterD,
int filterH,
int filterW,
int strideD,
int strideH,
int strideW,
int paddingD,
int paddingH,
int paddingW,
int depth_col,
int height_col,
int width_col,
real alpha,
real beta) {
for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_kernels;
index += blockDim.x * gridDim.x) {
real srcVal = 0;
real dstVal = dataDst[index];
int w = index % width + paddingW;
int h = (index / width) % height + paddingH;
int d = (index / width / height) % depth + paddingD;
int c = index / width / height / depth;
// compute the start and end of the output
int w_col_start = (w < filterW) ? 0 : (w - filterW) / strideW + 1;
int w_col_end = min(w / strideW + 1, width_col);
int h_col_start = (h < filterH) ? 0 : (h - filterH) / strideH + 1;
int h_col_end = min(h / strideH + 1, height_col);
int d_col_start = (d < filterD) ? 0 : (d - filterD) / strideD + 1;
int d_col_end = min(d / strideD + 1, depth_col);
int offset = (c * filterD * filterW * filterH + d * filterW * filterH +
h * filterW + w) *
depth_col * height_col * width_col;
int coeff_d_col =
(1 - strideD * filterW * filterH * depth_col) * height_col * width_col;
int coeff_h_col =
(1 - strideH * filterW * depth_col * height_col) * width_col;
int coeff_w_col = (1 - strideW * depth_col * height_col * width_col);
for (int d_col = d_col_start; d_col < d_col_end; ++d_col) {
for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
srcVal += dataSrc[offset + d_col * coeff_d_col + h_col * coeff_h_col +
w_col * coeff_w_col];
}
}
}
dataDst[index] = alpha * srcVal + beta * dstVal;
}
}
void hl_matrix_col2Vol(real* dataDst,
int channels,
int depth,
int height,
int width,
int filterD,
int filterH,
int filterW,
int strideD,
int strideH,
int strideW,
int paddingD,
int paddingH,
int paddingW,
const real* dataSrc,
real alpha,
real beta) {
int depth_col = (depth + 2 * paddingD - filterD) / strideD + 1;
int height_col = (height + 2 * paddingH - filterH) / strideH + 1;
int width_col = (width + 2 * paddingW - filterW) / strideW + 1;
int num_kernels = channels * depth * height * width;
const int threads = 512;
const int blocks = DIVUP(num_kernels, threads);
keMatrixCol2Vol<<<blocks, threads, 0, STREAM_DEFAULT>>>(num_kernels,
dataDst,
dataSrc,
depth,
height,
width,
filterD,
filterH,
filterW,
strideD,
strideH,
strideW,
paddingD,
paddingH,
paddingW,
depth_col,
height_col,
width_col,
alpha,
beta);
CHECK_SYNC("hl_matrix_col2Vol failed");
}
......@@ -15,23 +15,19 @@ cc_test(variable_test SRCS variable_test.cc)
cc_library(scope SRCS scope.cc)
cc_test(scope_test SRCS scope_test.cc DEPS scope)
proto_library(attribute_proto SRCS attribute.proto)
proto_library(op_proto SRCS op_proto.proto DEPS attribute_proto)
proto_library(op_desc SRCS op_desc.proto DEPS attribute_proto)
cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto protobuf)
cc_test(op_desc_test SRCS op_desc_test.cc DEPS op_desc protobuf)
proto_library(framework_proto SRCS framework.proto)
cc_library(attribute SRCS attribute.cc DEPS op_desc op_proto)
cc_library(operator SRCS operator.cc DEPS op_desc device_context tensor scope attribute)
cc_library(attribute SRCS attribute.cc DEPS framework_proto)
cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope)
cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry)
cc_library(grad_op_builder SRCS grad_op_builder.cc DEPS op_proto operator)
cc_library(op_registry SRCS op_registry.cc DEPS op_desc grad_op_builder)
cc_library(grad_op_builder SRCS grad_op_builder.cc DEPS operator)
cc_library(op_registry SRCS op_registry.cc DEPS grad_op_builder)
cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
cc_test(grad_op_builder_test SRCS grad_op_builder_test.cc DEPS grad_op_builder op_registry add_op)
py_proto_compile(framework_py_proto SRCS attribute.proto op_proto.proto op_desc.proto)
py_proto_compile(framework_py_proto SRCS framework.proto)
# Generate an empty __init__.py to make framework_py_proto as a valid python module.
add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
add_dependencies(framework_py_proto framework_py_proto_init)
......@@ -42,22 +38,4 @@ add_custom_command(TARGET framework_py_proto POST_BUILD
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
cc_library(backward SRCS backward.cc DEPS net_op)
cc_test(backward_test SRCS backward_test.cc DEPS backward)
if(WITH_PYTHON)
cc_library(paddle_pybind SHARED
SRCS pybind.cc
DEPS pybind python backward
sgd_op
add_op
mul_op
rowwise_add_op
sigmoid_op
softmax_op
mean_op
cross_entropy_op
recurrent_op
uniform_random_op
gaussian_random_op
fill_zeros_like_op)
endif(WITH_PYTHON)
cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context)
......@@ -43,8 +43,12 @@ template <>
AttrType AttrTypeID<std::vector<std::string>>() {
return STRINGS;
}
template <>
AttrType AttrTypeID<std::vector<std::pair<int, int>>>() {
return INT_PAIRS;
}
Attribute GetAttrValue(const AttrDesc& attr_desc) {
Attribute GetAttrValue(const OpDesc::Attr& attr_desc) {
switch (attr_desc.type()) {
case paddle::framework::AttrType::INT: {
return attr_desc.i();
......@@ -76,6 +80,14 @@ Attribute GetAttrValue(const AttrDesc& attr_desc) {
}
return val;
}
case paddle::framework::AttrType::INT_PAIRS: {
std::vector<std::pair<int, int>> val(attr_desc.int_pairs_size());
for (int i = 0; i < attr_desc.int_pairs_size(); ++i) {
val[i].first = attr_desc.int_pairs(i).first();
val[i].second = attr_desc.int_pairs(i).second();
}
return val;
}
}
PADDLE_ENFORCE(false, "Unknown OpDesc::AttrDesc::type !");
return boost::blank();
......
......@@ -20,8 +20,7 @@ limitations under the License. */
#include <unordered_set>
#include <vector>
#include "paddle/framework/attribute.pb.h"
#include "paddle/framework/op_desc.pb.h"
#include "paddle/framework/framework.pb.h"
#include "paddle/platform/enforce.h"
#include "paddle/platform/variant.h"
......@@ -29,7 +28,8 @@ namespace paddle {
namespace framework {
typedef boost::variant<boost::blank, int, float, std::string, std::vector<int>,
std::vector<float>, std::vector<std::string>>
std::vector<float>, std::vector<std::string>,
std::vector<std::pair<int, int>>>
Attribute;
typedef std::unordered_map<std::string, Attribute> AttributeMap;
......@@ -37,15 +37,27 @@ typedef std::unordered_map<std::string, Attribute> AttributeMap;
template <typename T>
AttrType AttrTypeID();
Attribute GetAttrValue(const AttrDesc& attr_desc);
Attribute GetAttrValue(const OpDesc::Attr& attr_desc);
// check whether a value(attribute) fit a certain limit
template <typename T>
class LargerThanChecker {
class GreaterThanChecker {
public:
explicit LargerThanChecker(T lower_bound) : lower_bound_(lower_bound) {}
explicit GreaterThanChecker(T lower_bound) : lower_bound_(lower_bound) {}
void operator()(T& value) const {
PADDLE_ENFORCE(value > lower_bound_, "larger_than check fail");
PADDLE_ENFORCE(value > lower_bound_, "larger_than check fails.");
}
private:
T lower_bound_;
};
template <typename T>
class EqualGreaterThanChecker {
public:
explicit EqualGreaterThanChecker(T lower_bound) : lower_bound_(lower_bound) {}
void operator()(T& value) const {
PADDLE_ENFORCE_GE(value, lower_bound_, "equal_larger_than check fails.");
}
private:
......@@ -110,8 +122,13 @@ class TypedAttrChecker {
return *this;
}
TypedAttrChecker& LargerThan(const T& lower_bound) {
value_checkers_.push_back(LargerThanChecker<T>(lower_bound));
TypedAttrChecker& GreaterThan(const T& lower_bound) {
value_checkers_.push_back(GreaterThanChecker<T>(lower_bound));
return *this;
}
TypedAttrChecker& EqualGreaterThan(const T& lower_bound) {
value_checkers_.push_back(EqualGreaterThanChecker<T>(lower_bound));
return *this;
}
......
......@@ -15,31 +15,44 @@
#include "paddle/framework/backward.h"
#include <list>
#include <memory>
#include "paddle/framework/op_registry.h"
#include "paddle/operators/net_op.h"
#include "paddle/operators/recurrent_op.h"
namespace paddle {
namespace framework {
static bool AllInSet(const std::vector<std::string>& names,
const std::string& suffix,
const std::unordered_set<std::string>& set) {
template <typename Map, typename T>
static void ForEachVarName(const Map& names, T callback) {
for (auto& name : names) {
if (set.find(name + suffix) == set.end()) {
return false;
for (auto& n : name.second) {
if (callback(n)) return;
}
}
return true;
}
static std::shared_ptr<OperatorBase> NOP() {
auto net_op = std::make_shared<operators::NetOp>();
net_op->type_ = "@NOP@";
// return whether all the names + suffixes are in the set
static bool AllInSet(
const std::map<std::string, std::vector<std::string>>& names,
const std::string& suffix, const std::unordered_set<std::string>& set) {
bool all_in_set = true;
ForEachVarName(names, [&all_in_set, &set, &suffix](const std::string& n) {
all_in_set = set.find(n + suffix) != set.end();
return !all_in_set;
});
return all_in_set;
}
static std::unique_ptr<OperatorBase> NOP() {
auto net_op = new operators::NetOp();
net_op->SetType("@NOP@");
net_op->CompleteAddOp();
return net_op;
return std::unique_ptr<OperatorBase>(net_op);
}
// Get backward operator from a forward operator, recursively implementation.
// Get backward operator from a forward operator, a recursive implementation.
//
// no_grad_names the gradient variable names without gradient calculating.
//
......@@ -47,126 +60,159 @@ static std::shared_ptr<OperatorBase> NOP() {
// BackwardRecursive. use `uid = uniq_id++;` to get the unique index, and
// pass `uniq_id` through recursive calling.
//
// returns The backward operator. For simple situation, it is a simple
// operator. For complex situation, it is a NetOp.
// returns The backward operator. In a simple situation, it may be a simple
// operator; in a complex situation, it may be a NetOp.
//
// See Backward.h for details
static std::shared_ptr<OperatorBase> BackwardRecursive(
const OperatorBase& forwardOp,
std::unordered_set<std::string>& no_grad_names, size_t& uniq_id);
std::shared_ptr<OperatorBase> BackwardRecursive(
static std::unique_ptr<OperatorBase> BackwardRecursive(
const OperatorBase& forwardOp,
std::unordered_set<std::string>& no_grad_names, size_t& uniq_id) {
// If none of the input gradients of the forwarding operator needs to be
// calculated, just return a NOP. We do not return a null pointer because a NOP
// takes little time to compute and is useful for simplifying the logic.
if (AllInSet(forwardOp.inputs_, kGradVarSuffix, no_grad_names)) {
if (AllInSet(forwardOp.Inputs() /*names*/, kGradVarSuffix /*suffix*/,
no_grad_names /*set*/)) {
return NOP();
}
// If none of the output gradients of the forwarding operator needs to be
// calculated, then none of the input gradients can be computed at all, so we
// put them into the `no_grad_names` set and return a NOP.
if (AllInSet(forwardOp.outputs_, kGradVarSuffix, no_grad_names)) {
for (auto& name : forwardOp.inputs_) {
// Mark all input is not need
no_grad_names.insert(name + kGradVarSuffix);
}
if (AllInSet(forwardOp.Outputs() /*names*/, kGradVarSuffix /*suffix*/,
no_grad_names /*set*/)) {
ForEachVarName(forwardOp.Inputs(),
[&no_grad_names](const std::string& name) -> bool {
no_grad_names.insert(GradVarName(name));
return false;
});
return NOP();
}
// Returned gradient network
auto net = std::make_shared<operators::NetOp>();
auto net = std::unique_ptr<operators::NetOp>(new operators::NetOp());
if (forwardOp.IsNetOp()) {
// Because forwardOp is a net op, it can static_cast.
auto& forwardNet = static_cast<const operators::NetOp&>(forwardOp);
// Map from output gradient variable name to operator's indices in
// backward net. That operator generates that variable.
// backward net's ops_. That operator generates that variable.
std::unordered_map<std::string, std::vector<size_t>> dup_output_ops;
size_t local_op_id = 0;
// reversely travel forwardNet
// reversely travel forwardNet and collect all duplicate outputs.
for (auto it = forwardNet.ops_.rbegin(); it != forwardNet.ops_.rend();
++it, ++local_op_id) {
auto fwd = *it;
auto& fwd = *it;
auto bwd = BackwardRecursive(*fwd, no_grad_names, uniq_id);
net->AddOp(bwd);
for (auto& out : bwd->outputs_) {
dup_output_ops[out].emplace_back(local_op_id);
}
ForEachVarName(bwd->Outputs(),
[&dup_output_ops, local_op_id](const std::string& out) {
dup_output_ops[out].emplace_back(local_op_id);
return false;
});
net->AppendOp(std::move(bwd));
}
// Get unique ID for this method.
auto uid = uniq_id++;
// TODO(dzh): more comment
using Pos = std::pair<size_t, std::shared_ptr<OperatorBase>>;
// multiple operators which have the same output (y for example) may
// overwrite the same y variable in the backward pass; special operations are taken
// to handle this case. For each duplicate output, rename it to an alias
// (the original name with an offset), append an `add` op for its operator,
// and finally sum all the alias variables into the final output variable y.
using Pos = std::pair<size_t, std::unique_ptr<OperatorBase>>;
std::list<Pos> insert_position;
for (auto& dup_output_op : dup_output_ops) {
const std::string& name = dup_output_op.first;
// duplicate @Empty@ doesn't need to be added
if (name == kEmptyVarName) continue;
auto& dup_op = dup_output_op.second;
// no duplicate output
if (dup_op.size() == 1) continue;
std::vector<std::string> dup_outputs;
// process the duplicate outputs
std::vector<std::string> dup_outputs;
for (size_t i = 0; i < dup_op.size(); ++i) {
// rename each duplicate output to an alias
auto op_offset = dup_op[i];
dup_outputs.push_back(name + "@RENAME@" + std::to_string(uid) + "@" +
std::to_string(i));
net->ops_[op_offset]->Rename(name, dup_outputs.back());
}
// collect all the offset to append `add` op for each alias
insert_position.push_back(
{dup_op.back(),
OpRegistry::CreateOp(
"add", {dup_outputs}, {name},
{{"input_format",
std::vector<int>{0, static_cast<int>(dup_outputs.size())}}})});
{dup_op.back(), OpRegistry::CreateOp("add", {{"X", {dup_outputs}}},
{{"Out", {name}}}, {})});
}
// make sure the inserted `add` ops follow the BFS order.
insert_position.sort(
[](const Pos& l, const Pos& r) { return l.first > r.first; });
for (auto& pos : insert_position) {
net->InsertOp(pos.first + 1, pos.second);
net->InsertOp(pos.first + 1, std::move(pos.second));
}
} else {
std::shared_ptr<OperatorBase> grad_op = OpRegistry::CreateGradOp(forwardOp);
for (std::string& grad_input : grad_op->inputs_) {
std::unique_ptr<OperatorBase> grad_op(OpRegistry::CreateGradOp(forwardOp));
ForEachVarName(grad_op->Inputs(), [&no_grad_names, &net, &grad_op](
const std::string& grad_input) {
if (no_grad_names.count(grad_input)) {
// +1 for \0
std::string prefix = grad_input.substr(
0, grad_input.size() - sizeof(kGradVarSuffix) / sizeof(char) + 1);
grad_input = prefix + kZeroVarSuffix;
grad_op->Rename(grad_input, prefix + kZeroVarSuffix);
// If part of input gradient of that operator is not calculated, fill
// zero variables to that input gradient.
net->AddOp(OpRegistry::CreateOp("fill_zeros_like", {prefix},
{grad_input}, {}));
}
}
for (std::string& grad_output : grad_op->outputs_) {
if (no_grad_names.count(grad_output)) {
grad_output = kEmptyVarName;
net->AppendOp(OpRegistry::CreateOp("fill_zeros_like",
{{"Src", {prefix}}},
{{"Dst", {grad_input}}}, {}));
}
return false;
});
ForEachVarName(grad_op->Outputs(),
[&no_grad_names, &grad_op](const std::string& grad_output) {
if (no_grad_names.count(grad_output)) {
grad_op->Rename(grad_output, kEmptyVarName);
}
return false;
});
// process recurrent gradient op as a special operator.
if (forwardOp.Type() == "recurrent") {
// NOTE clean up cycle call somewhere (RNN's stepnet contains itself), or
// this will result in an infinite loop.
const auto& rnnop =
*static_cast<const operators::RecurrentOp*>(&forwardOp);
auto rnn_grad_op =
static_cast<operators::RecurrentGradientOp*>(grad_op.get());
const auto& stepnet_op =
*static_cast<const OperatorBase*>(&rnnop.stepnet());
// create stepnet's gradient op
rnn_grad_op->set_stepnet(
BackwardRecursive(stepnet_op, no_grad_names, uniq_id));
}
if (net->ops_.empty()) { // Currently no aux op is added to the network
return grad_op;
}
net->AddOp(grad_op);
net->AppendOp(std::move(grad_op));
}
net->type_ = "@GENERATED_BACKWARD@";
net->SetType("@GENERATED_BACKWARD@");
net->CompleteAddOp();
return net;
return std::unique_ptr<OperatorBase>(
static_cast<OperatorBase*>(net.release()));
}
// See header for comments
std::shared_ptr<OperatorBase> Backward(
std::unique_ptr<OperatorBase> Backward(
const OperatorBase& forwardOp,
const std::unordered_set<std::string>& no_grad_vars) {
std::unordered_set<std::string> no_grad_names;
no_grad_names.reserve(no_grad_vars.size());
no_grad_names.reserve(no_grad_vars.size() + 1);
no_grad_names.insert(std::string(kEmptyVarName) + kGradVarSuffix);
......
......@@ -20,7 +20,7 @@ namespace framework {
// Create the backward operator from a forward operator.
// TODO(yuyang18): Add more API reference comment.
extern std::shared_ptr<OperatorBase> Backward(
extern std::unique_ptr<OperatorBase> Backward(
const OperatorBase& forwardOp,
const std::unordered_set<std::string>& no_grad_vars);
} // namespace framework
......
## Operator/expression's Backward
# Operator/expression's Backward
### Motivation
## Motivation
In neural networks, the backpropagation algorithm follows the chain rule, so we need to compound the fundamental gradient operators/expressions together with the chain rule. Every forward network needs a backward network to construct the full computation lineage; the operator/expression's Backward feature will generate the backward pass with respect to the forward pass.
In neural networks, the backpropagation algorithm follows the chain rule, so we need to compound the gradient operators/expressions together with the chain rule. Every forward network needs a backward network to construct the full computation graph; the operator/expression's backward pass will be generated with respect to the forward pass.
### Implement : gradient operator registry
## Backward Operator Registry
| | forward operator | backward operator |
| ---------------------- | ---------------- | -------------------------------- |
| **Operator::inputs_** | Inputs | Inputs, Outputs, OutputGradients |
| **Operator::outputs_** | Outputs | InputGradients |
A backward network is built up from several backward operators. Backward operators take the forward operators' inputs, outputs, and output gradients, and then calculate the corresponding input gradients.
Inputs/Outputs are the inputs/outputs of the operator, and InputGradients/OutputGradients are the gradients with respect to the forward operator. The forward operator and the backward operator are isomorphic; each saves what it needs into its member attributes.
| | forward operator | backward operator
| ---------------------- | ---------------- |------------------------- |
| **Operator::inputs_** | Inputs | Inputs, Outputs, OutputGradients |
| **Operator::outputs_** | Outputs | InputGradients |
We use a global hash map to record the available gradient operators, following the philosophy of a minimum core and making operators pluggable units. Each gradient is an operator and needs to register itself.
In most cases, there is a one-to-one correspondence between the forward and backward operators. These correspondences are recorded in a global hash map (`OpInfoMap`). To follow the philosophy of a minimum core and make operators pluggable, the registry mechanism is introduced.
grad_op_builder(fengjiayi)
For example, given a `mul_op`, we can register its information and its corresponding backward operator with the following macro:
### Implement : Backward network
```cpp
REGISTER_OP(mul, MulOp, MulOpMaker, mul_grad, MulOpGrad);
```
given a forward network, it generates the backward network. We only care about the Gradients—`OutputGradients`,`InputGradients`.
`mul` is the operator's type. `MulOp` and `MulOpMaker` are the operator class and the operator maker class respectively.
1. bla bla bla (yuyang)
`mul_grad` is the type of backward operator, and `MulOpGrad` is its class name.
## Backward Operator Creating
Given a certain forward operator, we can get its corresponding backward operator by calling:
```cpp
OperatorBase* bwd_op = BuildGradOp(const OperatorBase* fwd_op);
```
The function `BuildGradOp` will sequentially execute the following steps (a hedged sketch of these steps follows the list):
1. Get the `type_` of the given forward operator, and then get the corresponding backward operator's type by looking up the `OpInfoMap`.
2. Build two maps named `inputs` and `outputs` to temporarily store the backward operator's inputs and outputs. Copy the forward operator's `inputs_` and `outputs_` to the map `inputs`, except those that are not necessary for gradient computing.
3. Add the forward inputs' gradient variables into the map `outputs`, and add the forward outputs' gradient variables into the map `inputs`.
4. Build the backward operator with `inputs`, `outputs`, and the forward operator's attributes.
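The sketch below illustrates these four steps. It is not the actual implementation in `grad_op_builder.cc`; the helper `GradOpTypeOf` is invented for illustration, accessor names such as `Attrs()` are approximate, and the filtering of unnecessary forward variables is omitted:

```cpp
// Illustrative sketch only; error handling and corner cases are omitted.
std::unique_ptr<OperatorBase> BuildGradOpSketch(const OperatorBase* fwd_op) {
  // 1. look up the backward operator's type registered for this forward op
  const std::string grad_type = GradOpTypeOf(fwd_op->Type());  // via OpInfoMap

  // 2. the gradient op receives the forward inputs and outputs ...
  VariableNameMap inputs = fwd_op->Inputs();
  for (const auto& out : fwd_op->Outputs()) {
    inputs[out.first] = out.second;
    // 3. ... plus the gradients of the forward outputs
    std::vector<std::string> out_grads;
    for (const auto& name : out.second) out_grads.push_back(GradVarName(name));
    inputs[GradVarName(out.first)] = out_grads;
  }

  // 3. and it produces the gradients of the forward inputs
  VariableNameMap outputs;
  for (const auto& in : fwd_op->Inputs()) {
    std::vector<std::string> in_grads;
    for (const auto& name : in.second) in_grads.push_back(GradVarName(name));
    outputs[GradVarName(in.first)] = in_grads;
  }

  // 4. build the backward operator with the forward operator's attributes
  return OpRegistry::CreateOp(grad_type, inputs, outputs, fwd_op->Attrs());
}
```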
## Backward Network Building
A backward network is a series of backward operators. The main idea of building a backward network is creating backward operators in the inverted sequence and putting them together.
In our design, the network itself is also a kind of operator, so the operators contained in a big network may themselves be smaller networks.
Given a forward network, it generates the backward network. We only care about the gradients: `OutputGradients` and `InputGradients`.
1. Op
When the input forward network is an Op, return its gradient operator immediately.
2. NetOp
When the input forward network is a NetOp, it needs to call the backward functions of its sub NetOps/Operators recursively and ensure they are done. During the process, we need to collect the `OutputGradients` names.
When the input forward network is a NetOp, it needs to call the backward functions of its sub NetOps/Operators recursively. During the process, we need to collect the `OutputGradients` names according to the forward NetOp.
**Shared variables**. As illustrated in the pictures, two operators' output gradients will overwrite their shared input variable.
<p align="center">
<img src="./images/duplicate_op.png" width="50%" ><br/>
1. Shared variable in operators.
</p>
Sharing a variable between operators, or using the same input variable in multiple operators, leads to duplicate gradient variables. As the demo above shows, we need to rename the gradient names recursively and add a generic `add` operator to replace the overwriting links (a concrete sketch of this step follows the figures below).
<p align="center">
<img src="images/duplicate_op2.png" width="50%" ><br/>
We share variables in the same scope; as a result, duplicate operator `OutputGradients` will overwrite the duplicated variable.
2. Replace shared variable's gradient with `Add` operator.
![./images/duplicate_op]()
</p>
Sharing a variable between operators, or using the same input variable in multiple operators, leads to duplicate gradient variables. As the demo above shows, we need to rename the gradient names recursively and add a generic `add` operator instead.
![./images/duplicate_op2]()
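As a concrete sketch of this de-duplication step (the variable name `w@GRAD` and the indices `op0`/`op1` are made up; the real logic lives in `BackwardRecursive` above): if two backward operators in `net` both write `w@GRAD`, the builder renames the two outputs to aliases and inserts a generic `add` op that sums them back into the original gradient variable:

```cpp
// Hedged sketch; op0/op1 are the indices of the two offending backward ops.
net->ops_[op0]->Rename("w@GRAD", "w@GRAD@RENAME@0@0");
net->ops_[op1]->Rename("w@GRAD", "w@GRAD@RENAME@0@1");
net->InsertOp(op1 + 1,
              OpRegistry::CreateOp(
                  "add", {{"X", {"w@GRAD@RENAME@0@0", "w@GRAD@RENAME@0@1"}}},
                  {{"Out", {"w@GRAD"}}}, {}));
```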
Then collect the subgraph's `OutputGradients`/`InputGradients` as the NetOp's and return it.
......@@ -28,21 +28,13 @@ using OpAttrChecker = framework::OpAttrChecker;
using Scope = framework::Scope;
using DeviceContext = platform::DeviceContext;
class EmptyOp : public OperatorBase {
public:
DEFINE_OPERATOR_CTOR(EmptyOp, OperatorBase)
void InferShape(const Scope &scope) const override {}
void Run(const Scope &scope, const DeviceContext &dev_ctx) const override {}
};
class RowWiseAddOpMaker : public OpProtoAndCheckerMaker {
public:
RowWiseAddOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "Input X of Add").IgnoreGradient();
AddInput("b", "Bias of Add").IgnoreGradient();
AddOutput("Out", "Out of Add").IgnoreGradient();
AddInput("X", "Input X of Add").NotInGradient();
AddInput("b", "Bias of Add").NotInGradient();
AddOutput("Out", "Out of Add").NotInGradient();
AddComment("Add Op");
}
};
......@@ -51,8 +43,8 @@ class MulOpMaker : public OpProtoAndCheckerMaker {
public:
MulOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("A", "A");
AddInput("B", "B");
AddInput("X", "A");
AddInput("Y", "B");
AddOutput("Out", "Out");
AddComment("Mul");
}
......@@ -63,7 +55,7 @@ class SigmoidOpMaker : public OpProtoAndCheckerMaker {
SigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "X");
AddOutput("Y", "Y");
AddOutput("Out", "Y");
AddComment("Sigmoid");
}
};
......@@ -73,21 +65,25 @@ class NoGradOpMaker : public OpProtoAndCheckerMaker {
NoGradOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "X input");
AddOutput("Y", "Y output");
AddOutput("Out", "Y output");
AddComment("NoGradOp, same input output. no Grad");
}
};
class FcOp : public operators::NetOp {
public:
void Init() override {
AddOp(OpRegistry::CreateOp("mul", {Input("X"), Input("W")},
{Output("mul_result")}, {}));
auto b_name = Input("b");
FcOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const AttributeMap &attrs)
: NetOp(type, inputs, outputs, attrs) {
AppendOp(OpRegistry::CreateOp("mul",
{{"X", {Input("X")}}, {"Y", {Input("W")}}},
{{"Out", {Output("mul_result")}}}, {}));
auto input_b = Inputs("b");
std::string before_act = "mul_result";
if (b_name != kEmptyVarName) {
AddOp(OpRegistry::CreateOp("rowwise_add", {Output("mul_result"), b_name},
{Output("add_result")}, {}));
if (input_b.size() != 0) {
AppendOp(OpRegistry::CreateOp(
"rowwise_add", {{"X", {Output("mul_result")}}, {"b", {input_b[0]}}},
{{"Out", {Output("add_result")}}}, {}));
before_act = "add_result";
} else {
auto out_varname = Output("add_result");
......@@ -96,8 +92,8 @@ class FcOp : public operators::NetOp {
}
}
AddOp(OpRegistry::CreateOp("sigmoid", {Output(before_act)}, {Output("Out")},
{}));
AppendOp(OpRegistry::CreateOp("sigmoid", {{"X", {Output(before_act)}}},
{{"Out", {Output("Out")}}}, {}));
CompleteAddOp(false);
}
};
......@@ -109,8 +105,8 @@ class FcOpMaker : public OpProtoAndCheckerMaker {
AddInput("X", "x");
AddInput("W", "w");
AddInput("b", "b");
AddOutput("mul_result", "").SetTemporary();
AddOutput("add_result", "").SetTemporary();
AddOutput("mul_result", "").AsIntermediate();
AddOutput("add_result", "").AsIntermediate();
AddOutput("Out", "");
AddComment("");
}
......@@ -131,8 +127,8 @@ class FillZeroOpMaker : public OpProtoAndCheckerMaker {
public:
FillZeroOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("x", "x");
AddOutput("out", "out");
AddInput("Src", "x");
AddOutput("Dst", "out");
AddComment("");
}
};
......@@ -141,8 +137,8 @@ class AddOpMaker : public OpProtoAndCheckerMaker {
public:
AddOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "x").SetMultiple();
AddOutput("Y", "y");
AddInput("X", "x").AsDuplicable();
AddOutput("Out", "out");
AddComment("");
}
};
......@@ -152,51 +148,48 @@ class AddOpMaker : public OpProtoAndCheckerMaker {
namespace f = paddle::framework;
namespace ops = paddle::operators;
using EnforceNotMet = paddle::platform::EnforceNotMet;
REGISTER_OP(rowwise_add, f::EmptyOp, f::RowWiseAddOpMaker);
REGISTER_GRADIENT_OP(rowwise_add, rowwise_add_grad, f::EmptyOp);
REGISTER_OP(mul, f::EmptyOp, f::MulOpMaker);
REGISTER_GRADIENT_OP(mul, mul_grad, f::EmptyOp);
REGISTER_OP(sigmoid, f::EmptyOp, f::SigmoidOpMaker);
REGISTER_GRADIENT_OP(sigmoid, sigmoid_grad, f::EmptyOp);
REGISTER_OP(nograd, f::EmptyOp, f::NoGradOpMaker);
REGISTER_OP(fill_zeros_like, f::EmptyOp, f::FillZeroOpMaker);
REGISTER_OP(add, f::EmptyOp, f::AddOpMaker);
REGISTER_GRADIENT_OP(add, add_grad, f::EmptyOp);
REGISTER_OP(fc, f::FcOp, f::FcOpMaker);
REGISTER_OP(many_output_op, f::EmptyOp, f::ManyOutputOpMaker);
REGISTER_GRADIENT_OP(many_output_op, many_output_op_grad, f::EmptyOp);
REGISTER_OP(rowwise_add, f::NOP, f::RowWiseAddOpMaker, rowwise_add_grad,
f::NOP);
REGISTER_OP(mul, f::NOP, f::MulOpMaker, mul_grad, f::NOP);
REGISTER_OP(sigmoid, f::NOP, f::SigmoidOpMaker, sigmoid_grad, f::NOP);
REGISTER_OP_WITHOUT_GRADIENT(nograd, f::NOP, f::NoGradOpMaker);
REGISTER_OP_WITHOUT_GRADIENT(fill_zeros_like, f::NOP, f::FillZeroOpMaker);
REGISTER_OP(add, f::NOP, f::AddOpMaker, add_grad, f::NOP);
REGISTER_OP_WITHOUT_GRADIENT(fc, f::FcOp, f::FcOpMaker);
REGISTER_OP(many_output_op, f::NOP, f::ManyOutputOpMaker, many_output_op_grad,
f::NOP);
TEST(Backward, simple_op_grad) {
auto fwd = f::OpRegistry::CreateOp("rowwise_add", {"X", "b"}, {"Out"}, {});
auto fwd = f::OpRegistry::CreateOp(
"rowwise_add", {{"X", {"x"}}, {"b", {"b"}}}, {{"Out", {"out"}}}, {});
ASSERT_NE(fwd, nullptr);
auto gop = f::OpRegistry::CreateGradOp(*fwd);
ASSERT_EQ(4UL, gop->inputs_.size());
ASSERT_EQ(f::kEmptyVarName, gop->inputs_[0]);
ASSERT_EQ("rowwise_add_grad", gop->type_);
ASSERT_EQ(f::GradVarName("X"), gop->outputs_[0]);
ASSERT_EQ(f::GradVarName("b"), gop->outputs_[1]);
ASSERT_EQ(f::GradVarName("X"), gop->Output(f::GradVarName("X")));
ASSERT_EQ(1UL, gop->Inputs().size());
ASSERT_EQ("rowwise_add_grad", gop->Type());
ASSERT_EQ(f::GradVarName("x"), gop->Output(f::GradVarName("X")));
ASSERT_EQ(f::GradVarName("b"), gop->Output(f::GradVarName("b")));
}
TEST(Backward, simple_op_not_need_grad) {
auto fwd = f::OpRegistry::CreateOp("rowwise_add", {"X", "b"}, {"Out"}, {});
auto fwd = f::OpRegistry::CreateOp(
"rowwise_add", {{"X", {"x"}}, {"b", {"b"}}}, {{"Out", {"out"}}}, {});
ASSERT_NE(fwd, nullptr);
auto gop = f::Backward(*fwd, {"X"});
ASSERT_EQ(std::find(gop->outputs_.begin(), gop->outputs_.end(),
f::GradVarName("X")),
gop->outputs_.end());
auto gop = f::Backward(*fwd, {"x"});
ASSERT_EQ(gop->Output(f::GradVarName("X")), f::kEmptyVarName);
auto no_input_gop = f::Backward(*fwd, {"X", "b"});
auto no_input_gop = f::Backward(*fwd, {"x", "b"});
ASSERT_NE(no_input_gop, nullptr);
ASSERT_TRUE(no_input_gop->IsNetOp());
ASSERT_EQ(0UL,
std::static_pointer_cast<ops::NetOp>(no_input_gop)->ops_.size());
ASSERT_EQ(0UL, static_cast<ops::NetOp *>(no_input_gop.get())->ops_.size());
}
TEST(Backward, net_fc_backward_normal) {
std::shared_ptr<f::OperatorBase> fwd = f::OpRegistry::CreateOp(
"fc", {"X", "w", "b"}, {"mul_result", "add_result", "out"}, {});
std::shared_ptr<f::OperatorBase> fwd =
f::OpRegistry::CreateOp("fc", {{"X", {"x"}}, {"W", {"w"}}, {"b", {"b"}}},
{{"mul_result", {"mul_res"}},
{"add_result", {"add_re"}},
{"Out", {"out"}}},
{});
ASSERT_NE(fwd, nullptr);
std::shared_ptr<f::OperatorBase> gop = f::Backward(*fwd, {});
ASSERT_TRUE(gop->IsNetOp());
......@@ -207,19 +200,22 @@ TEST(Backward, net_fc_backward_normal) {
ASSERT_EQ(3UL, net->ops_.size());
f::OperatorBase &d_sigmoid = *net->ops_[0];
ASSERT_EQ("sigmoid_grad", d_sigmoid.type_);
ASSERT_EQ("sigmoid_grad", d_sigmoid.Type());
f::OperatorBase &d_add = *net->ops_[1];
ASSERT_EQ("rowwise_add_grad", d_add.type_);
ASSERT_EQ("rowwise_add_grad", d_add.Type());
f::OperatorBase &d_mul = *net->ops_[2];
ASSERT_EQ("mul_grad", d_mul.type_);
ASSERT_EQ("mul_grad", d_mul.Type());
}
TEST(Backward, net_fc_backward_not_have_b) {
std::shared_ptr<f::OperatorBase> fwd =
f::OpRegistry::CreateOp("fc", {"X", "w", f::kEmptyVarName},
{"mul_result", "add_result", "tmp"}, {});
f::OpRegistry::CreateOp("fc", {{"X", {"x"}}, {"W", {"w"}}, {"b", {}}},
{{"mul_result", {"mul_res"}},
{"add_result", {"add_res"}},
{"Out", {"tmp"}}},
{});
ASSERT_NE(fwd, nullptr);
std::shared_ptr<f::OperatorBase> gop = f::Backward(*fwd, {});
ASSERT_TRUE(gop->IsNetOp());
......@@ -230,96 +226,113 @@ TEST(Backward, net_fc_backward_not_have_b) {
ASSERT_EQ(2UL, net->ops_.size());
f::OperatorBase &d_sigmoid = *net->ops_[0];
ASSERT_EQ("sigmoid_grad", d_sigmoid.type_);
ASSERT_EQ("sigmoid_grad", d_sigmoid.Type());
f::OperatorBase &d_mul = *net->ops_[1];
ASSERT_EQ("mul_grad", d_mul.type_);
ASSERT_EQ("mul_grad", d_mul.Type());
}
TEST(Backward, net_input_of_network_not_need_grad) {
ops::NetOp net;
net.AddOp(f::OpRegistry::CreateOp("fc", {"X", "W1", "b1"},
{"mul_tmp_0", "add_tmp_0", "hidden0"}, {}));
net.AddOp(f::OpRegistry::CreateOp("fc", {"hidden0", "W2", "b2"},
{"mul_tmp_1", "add_tmp_1", "hidden1"}, {}));
net.AppendOp(f::OpRegistry::CreateOp(
"fc", {{"X", {"x"}}, {"W", {"W1"}}, {"b", {"b1"}}},
{{"mul_result", {"mul_tmp_0"}},
{"add_result", {"add_tmp_0"}},
{"Out", {"hidden0"}}},
{}));
net.AppendOp(f::OpRegistry::CreateOp(
"fc", {{"X", {"hidden0"}}, {"W", {"W2"}}, {"b", {"b2"}}},
{{"mul_result", {"mul_tmp_1"}},
{"add_result", {"add_tmp_1"}},
{"Out", {"hidden1"}}},
{}));
net.CompleteAddOp();
auto bwd = Backward(net, {"X"}); // X@GRAD is not needed.
auto bwd = Backward(net, {"x"}); // x@GRAD is not needed.
ASSERT_TRUE(bwd->IsNetOp());
auto bwd_net = static_cast<ops::NetOp *>(bwd.get());
std::unordered_set<std::string> all_output = std::unordered_set<std::string>(
bwd_net->outputs_.begin(), bwd_net->outputs_.end());
all_output.erase(f::kEmptyVarName);
auto output_vars = bwd_net->OutputVars(true);
std::unordered_set<std::string> all_outputs =
std::unordered_set<std::string>(output_vars.begin(), output_vars.end());
all_outputs.erase(f::kEmptyVarName);
for (auto &out : {"W1", "b1", "hidden0", "W2", "b2"}) {
ASSERT_NE(all_output.find(f::GradVarName(out)), all_output.end());
ASSERT_NE(all_outputs.find(f::GradVarName(out)), all_outputs.end());
}
// Not Generated X
ASSERT_EQ(all_output.find(f::GradVarName("X")), all_output.end());
ASSERT_EQ(all_outputs.find(f::GradVarName("X")), all_outputs.end());
ASSERT_EQ(2UL, bwd_net->ops_.size());
ASSERT_TRUE(bwd_net->ops_[1]->IsNetOp());
auto first_fc_grad = static_cast<ops::NetOp *>(bwd_net->ops_[1].get());
ASSERT_EQ(3UL, first_fc_grad->ops_.size());
ASSERT_EQ(f::kEmptyVarName,
first_fc_grad->ops_[2]->Output(f::GradVarName("A")));
first_fc_grad->ops_[2]->Output(f::GradVarName("X")));
}
TEST(Backward, net_shared_weight) {
ops::NetOp net;
net.AddOp(f::OpRegistry::CreateOp("mul", {"X", "W"}, {"Out"}, {}));
net.AddOp(f::OpRegistry::CreateOp("mul", {"Out", "W"}, {"FinalOut"}, {}));
net.AppendOp(f::OpRegistry::CreateOp("mul", {{"X", {"x"}}, {"Y", {"w"}}},
{{"Out", {"out"}}}, {}));
net.AppendOp(f::OpRegistry::CreateOp("mul", {{"X", {"out"}}, {"Y", {"w"}}},
{{"Out", {"FinalOut"}}}, {}));
net.CompleteAddOp();
auto bwd = f::Backward(net, {});
ASSERT_TRUE(bwd->IsNetOp());
auto bwd_net = static_cast<ops::NetOp *>(bwd.get());
ASSERT_EQ(3UL, bwd_net->ops_.size());
ASSERT_EQ("add", bwd_net->ops_[2]->type_);
ASSERT_EQ("add", bwd_net->ops_[2]->Type());
}
TEST(Backward, op_register_grad_not_for_network) {
auto fwd = f::OpRegistry::CreateOp(
"fc", {"X", "W", "b"}, {"mul_out", "add_out", "out1"},
{{"temporary_index", std::vector<int>{0, 1}}});
auto fwd =
f::OpRegistry::CreateOp("fc", {{"X", {"x"}}, {"W", {"w"}}, {"b", {"b"}}},
{{"mul_result", {"mul_out"}},
{"add_result", {"add_out"}},
{"Out", {"out1"}}},
{{"temporary_index", std::vector<int>{0, 1}}});
ASSERT_THROW(f::OpRegistry::CreateGradOp(*fwd), EnforceNotMet);
}
TEST(Backward, op_all_input_are_not_need) {
auto fwd = f::OpRegistry::CreateOp("rowwise_add", {"X", "b"}, {"Out"}, {});
auto backward = f::Backward(*fwd, {"X", "b"});
auto fwd = f::OpRegistry::CreateOp(
"rowwise_add", {{"X", {"x"}}, {"b", {"b"}}}, {{"Out", {"out"}}}, {});
auto backward = f::Backward(*fwd, {"x", "b"});
ASSERT_TRUE(backward->IsNetOp());
auto net = static_cast<ops::NetOp *>(backward.get());
ASSERT_TRUE(net->ops_.empty());
}
TEST(Backward, op_all_output_are_not_need) {
auto fwd = f::OpRegistry::CreateOp("rowwise_add", {"X", "b"}, {"Out"}, {});
auto backward = f::Backward(*fwd, {"Out"});
auto fwd = f::OpRegistry::CreateOp(
"rowwise_add", {{"X", {"x"}}, {"b", {"b"}}}, {{"Out", {"out"}}}, {});
auto backward = f::Backward(*fwd, {"out"});
ASSERT_TRUE(backward->IsNetOp());
auto net = static_cast<ops::NetOp *>(backward.get());
ASSERT_TRUE(net->ops_.empty());
}
TEST(Backward, op_part_of_output_are_not_need) {
auto fwd = f::OpRegistry::CreateOp("many_output_op", {"X"}, {"Y", "Z"}, {});
auto fwd = f::OpRegistry::CreateOp("many_output_op", {{"x", {"X"}}},
{{"y", {"Y"}}, {"z", {"Z"}}}, {});
auto backward = f::Backward(*fwd, {"Z"});
ASSERT_TRUE(backward->IsNetOp());
auto net = static_cast<ops::NetOp *>(backward.get());
ASSERT_EQ(net->ops_.size(), 2UL);
auto &fill_zero = *net->ops_[0];
ASSERT_EQ("fill_zeros_like", fill_zero.type_);
ASSERT_EQ(1UL, fill_zero.inputs_.size());
ASSERT_EQ("Z", fill_zero.inputs_[0]);
ASSERT_EQ(1UL, fill_zero.outputs_.size());
ASSERT_EQ(std::string("Z") + f::kZeroVarSuffix, fill_zero.outputs_[0]);
ASSERT_EQ("fill_zeros_like", fill_zero.Type());
ASSERT_EQ(1UL, fill_zero.Inputs("Src").size());
ASSERT_EQ("Z", fill_zero.Input("Src"));
ASSERT_EQ(1UL, fill_zero.Outputs("Dst").size());
ASSERT_EQ(std::string("Z") + f::kZeroVarSuffix, fill_zero.Output("Dst"));
auto &d_many_out = *net->ops_[1];
ASSERT_EQ("many_output_op_grad", d_many_out.type_);
ASSERT_EQ(1UL + 2UL + 2UL, d_many_out.inputs_.size()); // I/O/OG
ASSERT_EQ("many_output_op_grad", d_many_out.Type());
ASSERT_EQ(1UL + 2UL + 2UL, d_many_out.Inputs().size()); // I/O/OG
ASSERT_EQ(std::string("Z") + f::kZeroVarSuffix,
d_many_out.Input(f::GradVarName("z")));
ASSERT_EQ(f::GradVarName("Y"), d_many_out.Input(f::GradVarName("y")));
......@@ -327,44 +340,62 @@ TEST(Backward, op_part_of_output_are_not_need) {
}
TEST(Backward, op_part_of_input_are_not_need) {
auto fwd = f::OpRegistry::CreateOp("mul", {"a", "b"}, {"out"}, {});
auto fwd = f::OpRegistry::CreateOp("mul", {{"X", {"a"}}, {"Y", {"b"}}},
{{"Out", {"out"}}}, {});
auto backward = f::Backward(*fwd, {"a"});
auto &grad_mul = *backward;
ASSERT_EQ(grad_mul.type_, "mul_grad");
ASSERT_EQ(grad_mul.inputs_.size(), 2UL + 1UL + 1UL);
ASSERT_EQ(grad_mul.outputs_.size(), 2UL);
ASSERT_EQ(grad_mul.Output(f::GradVarName("A")), f::kEmptyVarName);
ASSERT_EQ(grad_mul.Output(f::GradVarName("B")), f::GradVarName("b"));
ASSERT_EQ(grad_mul.Type(), "mul_grad");
ASSERT_EQ(grad_mul.Inputs().size(), 2UL + 1UL + 1UL);
ASSERT_EQ(grad_mul.Outputs().size(), 2UL);
ASSERT_EQ(grad_mul.Output(f::GradVarName("X")), f::kEmptyVarName);
ASSERT_EQ(grad_mul.Output(f::GradVarName("Y")), f::GradVarName("b"));
ASSERT_EQ(grad_mul.Input(f::GradVarName("Out")), f::GradVarName("out"));
ASSERT_EQ(grad_mul.Input("A"), "a");
ASSERT_EQ(grad_mul.Input("B"), "b");
ASSERT_EQ(grad_mul.Input("X"), "a");
ASSERT_EQ(grad_mul.Input("Y"), "b");
ASSERT_EQ(grad_mul.Input("Out"), "out");
}
TEST(Backward, linear_net_intermediate_variable_has_no_grad) {
ops::NetOp net;
net.AddOp(f::OpRegistry::CreateOp("fc", {"x1", "w1", "b1"},
{"mul_out1", "add_out1", "out1"}, {}));
net.AddOp(f::OpRegistry::CreateOp("fc", {"out1", "w2", "b2"},
{"mul_out2", "tmp_out2", "out2"}, {}));
net.AddOp(f::OpRegistry::CreateOp("fc", {"out2", "w3", "b3"},
{"mul_out3", "tmp_out3", "out3"}, {}));
net.AppendOp(f::OpRegistry::CreateOp(
"fc", {{"X", {"x1"}}, {"W", {"w1"}}, {"b", {"b1"}}},
{{"mul_result", {"mul_out1"}},
{"add_result", {"add_out1"}},
{"Out", {"out1"}}},
{}));
net.AppendOp(f::OpRegistry::CreateOp(
"fc", {{"X", {"out1"}}, {"W", {"w2"}}, {"b", {"b2"}}},
{{"mul_result", {"mul_out2"}},
{"add_result", {"tmp_out2"}},
{"Out", {"out2"}}},
{}));
net.AppendOp(f::OpRegistry::CreateOp(
"fc", {{"X", {"out2"}}, {"W", {"w3"}}, {"b", {"b3"}}},
{{"mul_result", {"mul_out3"}},
{"add_result", {"tmp_out3"}},
{"Out", {"out3"}}},
{}));
net.CompleteAddOp();
auto backward = f::Backward(net, {"mul_out2", "tmp_out2", "out2"});
ASSERT_TRUE(backward->IsNetOp());
auto bwd_net = static_cast<ops::NetOp *>(backward.get());
ASSERT_EQ(bwd_net->ops_.size(), 3UL);
auto &grad_fc = *bwd_net->ops_[0];
EXPECT_EQ(grad_fc.inputs_.size(),
3UL /* external input number */
const char *all = paddle::operators::NetOp::kAll;
EXPECT_EQ(grad_fc.Inputs(all).size(),
2UL /* external input number */
+ 1UL /* external output number*/
+ 1UL /* number of gradient of external output*/
+ 2U /* internal variable number*/);
EXPECT_EQ(grad_fc.outputs_.size(), 2UL /* input number of mul*/
+ 2UL /* input number of rowwise_add */
+ 1UL /* input number of sigmoid */);
EXPECT_EQ(bwd_net->ops_[1]->inputs_.size(), 0UL);
EXPECT_EQ(bwd_net->ops_[1]->outputs_.size(), 0UL);
EXPECT_EQ(bwd_net->ops_[2]->inputs_.size(), 0UL);
EXPECT_EQ(bwd_net->ops_[2]->outputs_.size(), 0UL);
EXPECT_EQ(grad_fc.Outputs(all).size(),
2UL /* input number of mul*/
+ 2UL /* input number of rowwise_add
*/
+ 1UL /* input number of sigmoid */);
EXPECT_EQ(bwd_net->ops_[1]->Inputs(all).size(), 0UL);
EXPECT_EQ(bwd_net->ops_[1]->Outputs(all).size(), 0UL);
EXPECT_EQ(bwd_net->ops_[2]->Inputs(all).size(), 0UL);
EXPECT_EQ(bwd_net->ops_[2]->Outputs(all).size(), 0UL);
}
......@@ -21,16 +21,16 @@ namespace framework {
/// @cond HIDDEN
template <int i>
Dim<i> make_dim(const int* d) {
Dim<i> make_dim(const int64_t* d) {
return Dim<i>(*d, make_dim<i - 1>(d + 1));
}
template <>
Dim<1> make_dim<1>(const int* d) {
Dim<1> make_dim<1>(const int64_t* d) {
return Dim<1>(*d);
}
void make_ddim(DDim& ddim, const int* dims, int n) {
void make_ddim(DDim& ddim, const int64_t* dims, int n) {
switch (n) {
case 1:
ddim = make_dim<1>(dims);
......@@ -67,13 +67,13 @@ void make_ddim(DDim& ddim, const int* dims, int n) {
/// @endcond
DDim make_ddim(std::initializer_list<int> dims) {
DDim make_ddim(std::initializer_list<int64_t> dims) {
DDim result(make_dim(0));
make_ddim(result, dims.begin(), dims.size());
return result;
}
DDim make_ddim(const std::vector<int>& dims) {
DDim make_ddim(const std::vector<int64_t>& dims) {
DDim result(make_dim(0));
make_ddim(result, &dims[0], dims.size());
return result;
......@@ -81,12 +81,12 @@ DDim make_ddim(const std::vector<int>& dims) {
/// @cond HIDDEN
// XXX For some reason, putting this in an anonymous namespace causes errors
class DynamicMutableIndexer : public boost::static_visitor<int&> {
class DynamicMutableIndexer : public boost::static_visitor<int64_t&> {
public:
explicit DynamicMutableIndexer(int idx) : idx_(idx) {}
template <int D>
int& operator()(Dim<D>& dim) const {
int64_t& operator()(Dim<D>& dim) const {
return dim[idx_];
}
......@@ -94,12 +94,12 @@ class DynamicMutableIndexer : public boost::static_visitor<int&> {
int idx_;
};
class DynamicConstIndexer : public boost::static_visitor<int> {
class DynamicConstIndexer : public boost::static_visitor<int64_t> {
public:
explicit DynamicConstIndexer(int idx) : idx_(idx) {}
template <int D>
int operator()(const Dim<D>& dim) const {
int64_t operator()(const Dim<D>& dim) const {
return dim[idx_];
}
......@@ -109,22 +109,22 @@ class DynamicConstIndexer : public boost::static_visitor<int> {
/// @endcond
int& DDim::operator[](int idx) {
int64_t& DDim::operator[](int idx) {
return boost::apply_visitor(DynamicMutableIndexer(idx), var);
}
int DDim::operator[](int idx) const {
int64_t DDim::operator[](int idx) const {
return boost::apply_visitor(DynamicConstIndexer(idx), var);
}
ssize_t DDim::size() const { return arity(*this); }
int64_t DDim::size() const { return arity(*this); }
bool DDim::operator==(DDim d) const {
if (var.which() != d.getVar().which()) {
return false;
} else {
std::vector<int> v1 = vectorize(*this);
std::vector<int> v2 = vectorize(d);
std::vector<int64_t> v1 = vectorize(*this);
std::vector<int64_t> v2 = vectorize(d);
for (unsigned int i = 0; i < v1.size(); i++) {
if (v1[i] != v2[i]) {
......@@ -139,10 +139,10 @@ bool DDim::operator==(DDim d) const {
bool DDim::operator!=(DDim d) const { return !(*this == d); }
DDim DDim::operator+(DDim d) const {
std::vector<int> v1 = vectorize(*this);
std::vector<int> v2 = vectorize(d);
std::vector<int64_t> v1 = vectorize(*this);
std::vector<int64_t> v2 = vectorize(d);
std::vector<int> v3;
std::vector<int64_t> v3;
assert(v1.size() == v2.size());
......@@ -154,10 +154,10 @@ DDim DDim::operator+(DDim d) const {
}
DDim DDim::operator*(DDim d) const {
std::vector<int> v1 = vectorize(*this);
std::vector<int> v2 = vectorize(d);
std::vector<int64_t> v1 = vectorize(*this);
std::vector<int64_t> v2 = vectorize(d);
std::vector<int> v3;
std::vector<int64_t> v3;
assert(v1.size() == v2.size());
......@@ -168,15 +168,15 @@ DDim DDim::operator*(DDim d) const {
return make_ddim(v3);
}
int get(const DDim& ddim, int idx) { return ddim[idx]; }
int64_t get(const DDim& ddim, int idx) { return ddim[idx]; }
void set(DDim& ddim, int idx, int value) { ddim[idx] = value; }
/// @cond HIDDEN
struct VectorizeVisitor : public boost::static_visitor<> {
std::vector<int>& vector;
std::vector<int64_t>& vector;
explicit VectorizeVisitor(std::vector<int>& v) : vector(v) {}
explicit VectorizeVisitor(std::vector<int64_t>& v) : vector(v) {}
template <typename T>
void operator()(const T& t) {
......@@ -188,31 +188,31 @@ struct VectorizeVisitor : public boost::static_visitor<> {
};
/// @endcond
std::vector<int> vectorize(const DDim& ddim) {
std::vector<int> result;
std::vector<int64_t> vectorize(const DDim& ddim) {
std::vector<int64_t> result;
VectorizeVisitor visitor(result);
boost::apply_visitor(visitor, ddim);
return result;
}
struct ProductVisitor : public boost::static_visitor<ssize_t> {
struct ProductVisitor : public boost::static_visitor<int64_t> {
template <int D>
ssize_t operator()(const Dim<D>& dim) {
int64_t operator()(const Dim<D>& dim) {
return product(dim);
}
};
ssize_t product(const DDim& ddim) {
int64_t product(const DDim& ddim) {
ProductVisitor visitor;
return boost::apply_visitor(visitor, ddim);
}
struct SliceVectorizeVisitor : public boost::static_visitor<> {
std::vector<int>& vector;
std::vector<int64_t>& vector;
int begin;
int end;
SliceVectorizeVisitor(std::vector<int>& v, int b, int e)
SliceVectorizeVisitor(std::vector<int64_t>& v, int b, int e)
: vector(v), begin(b), end(e) {
PADDLE_ENFORCE(begin < end,
"Begin index must be less than end index in ddim slice.");
......@@ -240,7 +240,7 @@ struct SliceVectorizeVisitor : public boost::static_visitor<> {
};
DDim slice_ddim(const DDim& dim, int begin, int end) {
std::vector<int> vec;
std::vector<int64_t> vec;
vec.reserve(end - begin);
SliceVectorizeVisitor visitor(vec, begin, end);
boost::apply_visitor(visitor, dim);
......@@ -280,9 +280,17 @@ std::ostream& operator<<(std::ostream& os, const DDim& ddim) {
return os;
}
DDim::DDim(std::initializer_list<int> init_list) {
DDim::DDim(std::initializer_list<int64_t> init_list) {
*this = make_ddim(init_list);
}
DDim flatten_to_2d(const DDim& src, int num_col_dims) {
int rank = src.size();
return make_ddim({product(slice_ddim(src, 0, num_col_dims)),
product(slice_ddim(src, num_col_dims, rank))});
}
DDim flatten_to_1d(const DDim& src) { return make_ddim({product(src)}); }
} // namespace framework
} // namespace paddle
......@@ -40,7 +40,7 @@ struct DDim {
template <int D>
explicit DDim(const Dim<D>& in) : var(in) {}
/*implicit*/ DDim(std::initializer_list<int> init_list);
/*implicit*/ DDim(std::initializer_list<int64_t> init_list);
template <int D>
DDim& operator=(const Dim<D>& in) {
......@@ -48,8 +48,8 @@ struct DDim {
return *this;
}
int& operator[](int idx);
int operator[](int idx) const;
int64_t& operator[](int idx);
int64_t operator[](int idx) const;
template <typename Visitor>
typename Visitor::result_type apply_visitor(Visitor& visitor) {
......@@ -71,15 +71,15 @@ struct DDim {
DDim operator*(DDim d) const;
ssize_t size() const;
int64_t size() const;
};
/**
* \brief Make a DDim from std::vector<int>
* \brief Make a DDim from std::vector<int64_t>
*
* \param dims A vector of ints. Must be sized between [1, 9]
*/
DDim make_ddim(const std::vector<int>& dims);
DDim make_ddim(const std::vector<int64_t>& dims);
/**
* \brief Make a DDim from an initializer list
......@@ -87,14 +87,14 @@ DDim make_ddim(const std::vector<int>& dims);
* \param dims An initializer list of ints. Must be sized between [1, 9]
*
*/
DDim make_ddim(std::initializer_list<int> dims);
DDim make_ddim(std::initializer_list<int64_t> dims);
int get(const DDim& dim, int idx);
int64_t get(const DDim& dim, int idx);
void set(DDim& dim, int idx, int val);
std::vector<int> vectorize(const DDim& ddim);
std::vector<int64_t> vectorize(const DDim& ddim);
ssize_t product(const DDim& ddim);
int64_t product(const DDim& ddim);
/**
* \brief Slice a ddim
......@@ -115,6 +115,12 @@ int arity(const DDim& ddim);
std::ostream& operator<<(std::ostream&, const DDim&);
// Reshape a tensor to a matrix. The matrix's first dimension(column length)
// will be the product of tensor's first `num_col_dims` dimensions.
DDim flatten_to_2d(const DDim& src, int num_col_dims);
DDim flatten_to_1d(const DDim& src);
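// Illustrative example (not part of the original header): for
// src = make_ddim({2, 3, 4, 5}) and num_col_dims = 2, flatten_to_2d(src, 2)
// yields [2*3, 4*5] = [6, 20], and flatten_to_1d(src) yields [120].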
} // namespace framework
} // namespace paddle
......
......@@ -12,7 +12,7 @@ TEST(DDim, Equality) {
EXPECT_EQ(ddim[2], 5);
// construct a DDim from a vector
std::vector<int> vec({9, 1, 5});
std::vector<int64_t> vec({9, 1, 5});
paddle::framework::DDim vddim = paddle::framework::make_ddim(vec);
EXPECT_EQ(ddim[0], 9);
EXPECT_EQ(ddim[1], 1);
......@@ -25,7 +25,7 @@ TEST(DDim, Equality) {
EXPECT_EQ(paddle::framework::get(ddim, 0), 6);
// vectorize a DDim
std::vector<int> res_vec = paddle::framework::vectorize(vddim);
std::vector<int64_t> res_vec = paddle::framework::vectorize(vddim);
EXPECT_EQ(res_vec[0], 9);
EXPECT_EQ(res_vec[1], 1);
EXPECT_EQ(res_vec[2], 5);
......
......@@ -8,7 +8,7 @@ __global__ void test(paddle::framework::Dim<2>* o) {
o[0] = paddle::framework::make_dim(5, 6);
}
__global__ void dyn_idx_gpu(int* o) {
__global__ void dyn_idx_gpu(int64_t* o) {
auto d = paddle::framework::make_dim(5, 6);
o[0] = d[1];
}
......@@ -47,9 +47,9 @@ TEST(Dim, Equality) {
EXPECT_EQ(b[1], 11);
// dynamic access on GPU
thrust::device_vector<int> r(1);
thrust::device_vector<int64_t> r(1);
dyn_idx_gpu<<<1, 1>>>(thrust::raw_pointer_cast(r.data()));
int res = r[0];
int64_t res = r[0];
EXPECT_EQ(res, 6);
// ex_prefix_mul
......
此差异已折叠。
此差异已折叠。