提交 ddc25873 编写于 作者: D dongzhihong

Merge remote-tracking branch 'origin/develop' into multigpu

......@@ -22,7 +22,9 @@ cmake-build-*
# generated while compiling
python/paddle/v2/framework/core.so
paddle/pybind/pybind.h
CMakeFiles
cmake_install.cmake
paddle/.timestamp
python/paddlepaddle.egg-info/
paddle/pybind/pybind.h
......@@ -36,10 +36,6 @@ before_install:
# protobuf version.
- sudo pip install -r $TRAVIS_BUILD_DIR/python/requirements.txt
- sudo pip install wheel sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit LinkChecker
- curl https://glide.sh/get | bash
- eval "$(GIMME_GO_VERSION=1.8.3 gimme)"
- go get -u github.com/alecthomas/gometalinter
- gometalinter --install
- |
function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
script:
......
......@@ -27,7 +27,7 @@ if(NOT CMAKE_CROSSCOMPILING)
endif(NOT CMAKE_CROSSCOMPILING)
find_package(Git REQUIRED)
find_package(Threads REQUIRED)
if(NOT ANDROID)
if(NOT ANDROID AND NOT IOS)
find_package(Boost QUIET)
endif()
......@@ -64,27 +64,29 @@ if(NOT CMAKE_BUILD_TYPE)
FORCE)
endif()
if(ANDROID)
if(ANDROID OR IOS)
if(ANDROID)
if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "16")
message(FATAL_ERROR "Unsupport standalone toolchains with Android API level lower than 16")
elseif(${CMAKE_SYSTEM_VERSION} VERSION_LESS "21")
# TODO: support glog for Android api 16 ~ 19 in the future
message(WARNING "Using the unofficial git repository <https://github.com/Xreki/glog.git> instead")
endif()
endif()
set(WITH_GPU OFF CACHE STRING
"Disable GPU when cross-compiling for Android" FORCE)
"Disable GPU when cross-compiling for Android and iOS" FORCE)
set(WITH_AVX OFF CACHE STRING
"Disable AVX when cross-compiling for Android" FORCE)
"Disable AVX when cross-compiling for Android and iOS" FORCE)
set(WITH_PYTHON OFF CACHE STRING
"Disable PYTHON when cross-compiling for Android" FORCE)
"Disable PYTHON when cross-compiling for Android and iOS" FORCE)
set(WITH_RDMA OFF CACHE STRING
"Disable RDMA when cross-compiling for Android" FORCE)
"Disable RDMA when cross-compiling for Android and iOS" FORCE)
set(WITH_MKLDNN OFF CACHE STRING
"Disable MKLDNN when cross-compiling for Android" FORCE)
"Disable MKLDNN when cross-compiling for Android and iOS" FORCE)
set(WITH_MKLML OFF CACHE STRING
"Disable MKLML package when cross-compiling for Android" FORCE)
endif(ANDROID)
"Disable MKLML package when cross-compiling for Android and iOS" FORCE)
endif()
set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
"A path setting third party libraries download & build directories.")
......
......@@ -51,19 +51,19 @@ Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddl
- **Connected to Products**
In addition, PaddlePaddle is also designed to be easily deployable. At Baidu,
PaddlePaddle has been deployed into products or service with a vast number
PaddlePaddle has been deployed into products and services with a vast number
of users, including ad click-through rate (CTR) prediction, large-scale image
classification, optical character recognition(OCR), search ranking, computer
virus detection, recommendation, etc. It is widely utilized in products at
Baidu and it has achieved a significant impact. We hope you can also exploit
the capability of PaddlePaddle to make a huge impact for your product.
Baidu and it has achieved a significant impact. We hope you can also explore
the capability of PaddlePaddle to make an impact on your product.
## Installation
It is recommended to check out the
[Docker installation guide](http://doc.paddlepaddle.org/develop/doc/getstarted/build_and_install/docker_install_en.html)
before looking into the
[build from source guide](http://doc.paddlepaddle.org/develop/doc/getstarted/build_and_install/build_from_source_en.html)
[build from source guide](http://doc.paddlepaddle.org/develop/doc/getstarted/build_and_install/build_from_source_en.html).
## Documentation
......@@ -72,7 +72,7 @@ We provide [English](http://doc.paddlepaddle.org/develop/doc/) and
- [Deep Learning 101](http://book.paddlepaddle.org/index.html)
You might want to start from this online interactive book that can run in Jupyter Notebook.
You might want to start from this online interactive book that can run in a Jupyter Notebook.
- [Distributed Training](http://doc.paddlepaddle.org/develop/doc/howto/usage/cluster/cluster_train_en.html)
......
......@@ -22,5 +22,5 @@ def initHook(settings, height, width, color, num_class, **kwargs):
def process(settings, file_list):
for i in xrange(1024):
img = np.random.rand(1, settings.data_size).reshape(-1, 1).flatten()
lab = random.randint(0, settings.num_class)
lab = random.randint(0, settings.num_class - 1)
yield img.astype('float32'), int(lab)
set -e
function train() {
unset OMP_NUM_THREADS MKL_NUM_THREADS
export OMP_DYNAMIC="FALSE"
export KMP_AFFINITY="granularity=fine,compact,0,0"
topology=$1
bs=$2
use_mkldnn=$3
if [ $3 == "True" ]; then
thread=1
log="logs/${topology}-mkldnn-${bs}.log"
elif [ $3 == "False" ]; then
thread=`nproc`
# each trainer_count use only 1 core to avoid conflict
export OMP_NUM_THREADS=1
export MKL_NUM_THREADS=1
log="logs/${topology}-${thread}mklml-${bs}.log"
else
echo "Wrong input $3, use True or False."
exit 0
fi
args="batch_size=${bs}"
config="${topology}.py"
paddle train --job=time \
--config=$config \
--use_mkldnn=$use_mkldnn \
--use_gpu=False \
--trainer_count=$thread \
--log_period=10 \
--test_period=100 \
--config_args=$args \
2>&1 | tee ${log}
}
if [ ! -d "train.list" ]; then
echo " " > train.list
fi
if [ ! -d "logs" ]; then
mkdir logs
fi
#========== mkldnn ==========#
train vgg 64 True
train vgg 128 True
train vgg 256 True
#========== mklml ===========#
train vgg 64 False
train vgg 128 False
train vgg 256 False
#!/usr/bin/env python
from paddle.trainer_config_helpers import *
height = 224
width = 224
num_class = 1000
batch_size = get_config_arg('batch_size', int, 64)
layer_num = get_config_arg('layer_num', int, 19)
args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
define_py_data_sources2(
"train.list", None, module="provider", obj="process", args=args)
settings(
batch_size=batch_size,
learning_rate=0.01 / batch_size,
learning_method=MomentumOptimizer(0.9),
regularization=L2Regularization(0.0005 * batch_size))
img = data_layer(name='image', size=height * width * 3)
def vgg_network(vgg_num=3):
tmp = img_conv_group(
input=img,
num_channels=3,
conv_padding=1,
conv_num_filter=[64, 64],
conv_filter_size=3,
conv_act=ReluActivation(),
pool_size=2,
pool_stride=2,
pool_type=MaxPooling())
tmp = img_conv_group(
input=tmp,
conv_num_filter=[128, 128],
conv_padding=1,
conv_filter_size=3,
conv_act=ReluActivation(),
pool_stride=2,
pool_type=MaxPooling(),
pool_size=2)
channels = []
for i in range(vgg_num):
channels.append(256)
tmp = img_conv_group(
input=tmp,
conv_num_filter=channels,
conv_padding=1,
conv_filter_size=3,
conv_act=ReluActivation(),
pool_stride=2,
pool_type=MaxPooling(),
pool_size=2)
channels = []
for i in range(vgg_num):
channels.append(512)
tmp = img_conv_group(
input=tmp,
conv_num_filter=channels,
conv_padding=1,
conv_filter_size=3,
conv_act=ReluActivation(),
pool_stride=2,
pool_type=MaxPooling(),
pool_size=2)
tmp = img_conv_group(
input=tmp,
conv_num_filter=channels,
conv_padding=1,
conv_filter_size=3,
conv_act=ReluActivation(),
pool_stride=2,
pool_type=MaxPooling(),
pool_size=2)
tmp = fc_layer(
input=tmp,
size=4096,
act=ReluActivation(),
layer_attr=ExtraAttr(drop_rate=0.5))
tmp = fc_layer(
input=tmp,
size=4096,
act=ReluActivation(),
layer_attr=ExtraAttr(drop_rate=0.5))
return fc_layer(input=tmp, size=num_class, act=SoftmaxActivation())
if layer_num == 16:
vgg = vgg_network(3)
elif layer_num == 19:
vgg = vgg_network(4)
else:
print("Wrong layer number.")
lab = data_layer('label', num_class)
loss = cross_entropy(input=vgg, label=lab)
outputs(loss)
......@@ -171,3 +171,10 @@ if (REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY)
add_definitions(-DPADDLE_USE_REFERENCE_CBLAS)
message(STATUS "Found reference-cblas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
endif()
if(IOS_USE_VECLIB_FOR_BLAS AND VECLIB_FOUND)
set(CBLAS_FOUND ON)
set(CBLAS_PROVIDER vecLib)
set(CBLAS_INC_DIR ${VECLIB_INC_DIR})
add_definitions(-DPADDLE_USE_VECLIB)
endif()
......@@ -26,9 +26,9 @@ set(IGNORE_PATTERN
.*ImportanceSampler.*
.*cblas\\.h.*
.*\\.pb\\.txt
.*LtrDataProvider.*
.*MultiDataProvider.*
.*pb.*)
.*pb.*
.*pybind.h)
# add_style_check_target
#
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This is a toolchain file for cross-compiling for iOS, and the
# configuration largely refers to public toolchain file:
# https://raw.githubusercontent.com/leetal/ios-cmake/master/ios.toolchain.cmake
# and
# https://github.com/cristeab/ios-cmake
#
# Supports options:
# IOS_PLATFORM = OS (default) or SIMULATOR
# This decides if SDKS will be selected from the iPhoneOS.platform or iPhoneSimulator.platform folders
# OS - the default, used to build for iPhone and iPad physical devices, which have an arm arch.
# SIMULATOR - used to build for the Simulator platforms, which have an x86 arch.
# IOS_ARCH
# The archectures wanted to support, such "arm64", "armv7;arm64"
# IOS_DEPLOYMENT_TARGET
# The minimum iOS deployment version, such as "7.0"
# IOS_ENABLE_BITCODE = ON (default) or OFF
# IOS_USE_VECLIB_FOR_BLAS = OFF (default) or ON
# IOS_DEVELOPER_ROOT = automatic(default) or /path/to/platform/Developer folder
# By default this location is automatcially chosen based on the IOS_PLATFORM value above.
# If set manually, it will override the default location and force the user of a particular Developer Platform
# IOS_SDK_ROOT = automatic(default) or /path/to/platform/Developer/SDKs/SDK folder
# By default this location is automatcially chosen based on the IOS_DEVELOPER_ROOT value.
# In this case it will always be the most up-to-date SDK found in the IOS_DEVELOPER_ROOT path.
# If set manually, this will force the use of a specific SDK version
# Macros:
# set_xcode_property (TARGET XCODE_PROPERTY XCODE_VALUE)
# A convenience macro for setting xcode specific properties on targets
# example: set_xcode_property (myioslib IPHONEOS_DEPLOYMENT_TARGET "3.1")
# find_host_package (PROGRAM ARGS)
# A macro used to find executable programs on the host system, not within the iOS environment.
# Thanks to the android-cmake project for providing the command
if(NOT IOS)
return()
endif()
set(CMAKE_SYSTEM_NAME Darwin)
# Get the Xcode version being used.
execute_process(COMMAND xcodebuild -version
OUTPUT_VARIABLE XCODE_VERSION
RESULT_VARIABLE XCODE_VERSION_RESULT
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
if(NOT ${XCODE_VERSION_RESULT})
string(REGEX MATCH "Xcode [0-9\\.]+" XCODE_VERSION "${XCODE_VERSION}")
string(REGEX REPLACE "Xcode ([0-9\\.]+)" "\\1" XCODE_VERSION "${XCODE_VERSION}")
message(STATUS "Building with Xcode version: ${XCODE_VERSION}")
else()
message(FATAL_ERROR "Cannot execute xcodebuild, please check whether xcode is installed.")
endif()
# Required as of cmake 2.8.10
set(CMAKE_OSX_DEPLOYMENT_TARGET "" CACHE STRING "Force unset of the deployment target for iOS" FORCE)
# Setup iOS platform unless specified manually with IOS_PLATFORM
if(NOT DEFINED IOS_PLATFORM)
set(IOS_PLATFORM "OS")
endif()
set(IOS_PLATFORM ${IOS_PLATFORM} CACHE STRING "Type of iOS Platform")
# Set the architecture for iOS
if(NOT DEFINED IOS_ARCH)
if(IOS_PLATFORM STREQUAL "OS")
# FIXME(liuyiqun): support "armv7;armv7s;arm64" future
set(IOS_ARCH "arm64")
elseif(IOS_PLATFORM STREQUAL "SIMULATOR")
set(IOS_ARCH "i386;x86_64")
elseif(IOS_PLATFORM STREQUAL "WATCHOS")
set(IOS_ARCH armv7k)
endif()
endif()
set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE string "Build architecture for iOS")
# Specify minimum iOS deployment version
if(NOT DEFINED IOS_DEPLOYMENT_TARGET)
set(IOS_DEPLOYMENT_TARGET "7.0")
endif()
set(IOS_DEPLOYMENT_TARGET ${IOS_DEPLOYMENT_TARGET} CACHE STRING "Minimum iOS version")
# Whether to enable bitcode
if(NOT DEFINED IOS_ENABLE_BITCODE)
set(IOS_ENABLE_BITCODE ON)
endif()
set(IOS_ENABLE_BITCODE ${IOS_ENABLE_BITCODE} CACHE BOOL "Whether to enable bitcode")
if(NOT DEFINED IOS_USE_VECLIB_FOR_BLAS)
set(IOS_USE_VECLIB_FOR_BLAS OFF)
endif()
set(IOS_USE_VECLIB_FOR_BLAS ${IOS_UES_VECLIB_FOR_BLAS} CACHE BOOL "Whether to use veclib")
# Check the platform selection and setup for developer root
if(${IOS_PLATFORM} STREQUAL "OS")
set(IOS_PLATFORM_LOCATION "iPhoneOS.platform")
set(XCODE_IOS_PLATFORM iphoneos)
# This causes the installers to properly locate the output libraries
set(CMAKE_XCODE_EFFECTIVE_PLATFORMS "-iphoneos")
elseif(${IOS_PLATFORM} STREQUAL "SIMULATOR")
set(IOS_PLATFORM_LOCATION "iPhoneSimulator.platform")
set(XCODE_IOS_PLATFORM iphonesimulator)
# This causes the installers to properly locate the output libraries
set(CMAKE_XCODE_EFFECTIVE_PLATFORMS "-iphonesimulator")
elseif(${IOS_PLATFORM} STREQUAL "WATCHOS")
set(IOS_PLATFORM_LOCATION "WatchOS.platform")
set(XCODE_IOS_PLATFORM watchos)
# This causes the installers to properly locate the output libraries
set(CMAKE_XCODE_EFFECTIVE_PLATFORMS "-watchos")
else(${IOS_PLATFORM} STREQUAL "OS")
message(FATAL_ERROR "Unsupported IOS_PLATFORM value selected. Please set to\n"
"\t OS, SIMULATOR, or WATCHOS.")
endif()
# Check iOS developer toolchain
if(NOT DEFINED IOS_DEVELOPER_ROOT)
# Setup iOS developer location
execute_process(COMMAND xcode-select -print-path
OUTPUT_VARIABLE XCODE_DEVELOPER_DIR
RESULT_VARIABLE XCODE_DEVELOPER_DIR_RESULT
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
# Xcode 4.3 changed the installation location, choose the most recent one available
if(${XCODE_VERSION} VERSION_LESS "4.3.0")
set(IOS_DEVELOPER_ROOT "/Developer/Platforms/${IOS_PLATFORM_LOCATION}/Developer")
else()
set(IOS_DEVELOPER_ROOT "${XCODE_DEVELOPER_DIR}/Platforms/${IOS_PLATFORM_LOCATION}/Developer")
endif()
endif()
if(EXISTS ${IOS_DEVELOPER_ROOT})
set(IOS_DEVELOPER_ROOT ${IOS_DEVELOPER_ROOT} CACHE PATH "Location of iOS Platform")
else()
message(FATAL_ERROR "Invalid IOS_DEVELOPER_ROOT: ${IOS_DEVELOPER_ROOT} does not exist.")
endif()
# Check iOS SDK
if(NOT DEFINED IOS_SDK_ROOT)
# Find and use the most recent iOS sdk
file(GLOB IOS_SDK_LISTS "${IOS_DEVELOPER_ROOT}/SDKs/*")
if(IOS_SDK_LISTS)
list(SORT IOS_SDK_LISTS)
list(REVERSE IOS_SDK_LISTS)
list(GET IOS_SDK_LISTS 0 IOS_SDK_ROOT)
else(IOS_SDK_LISTS)
message(FATAL_ERROR "No iOS SDK's found in default search path ${IOS_DEVELOPER_ROOT}."
" Please manually set IOS_SDK_ROOT or install the iOS SDK.")
endif(IOS_SDK_LISTS)
endif()
if(EXISTS ${IOS_SDK_ROOT})
set(IOS_SDK_ROOT ${IOS_SDK_ROOT} CACHE PATH "Location of the selected iOS SDK")
message(STATUS "iOS toolchain: ${IOS_SDK_ROOT}")
else()
message(FATAL_ERROR "Invalid IOS_SDK_ROOT: ${IOS_SDK_ROOT} does not exist.")
endif()
# Set the sysroot default to the most recent SDK
set(CMAKE_OSX_SYSROOT ${IOS_SDK_ROOT} CACHE PATH "Sysroot used for iOS support")
# Get version of iOS SDK
execute_process(COMMAND xcodebuild -sdk ${CMAKE_OSX_SYSROOT} -version SDKVersion
OUTPUT_VARIABLE IOS_SDK_VERSION
RESULT_VARIABLE IOS_SDK_VERSION_RESULT
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
if(${IOS_SDK_VERSION_RESULT})
string(REGEX MATCH "(([0-9]+)\\.)+([0-9]+)" IOS_SDK_VERSION "${IOS_SDK_ROOT}")
endif()
if(NOT IOS_SDK_VERSION)
message(WARNING "Cannot get SDK's version.")
set(IOS_SDK_VERSION 1)
endif()
set(CMAKE_SYSTEM_VERSION ${IOS_SDK_VERSION})
# Find the C & C++ compilers for the specified SDK.
if(NOT CMAKE_C_COMPILER)
# Default to use clang
execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find clang
OUTPUT_VARIABLE IOS_C_COMPILER
RESULT_VARIABLE IOS_C_COMPILER_RESULT
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
if(${IOS_C_COMPILER_RESULT})
get_filename_component(IOS_C_COMPILER clang PROGRAM)
endif()
else(NOT CMAKE_C_COMPILER)
# User can set it in cmake command
get_filename_component(IOS_C_COMPILER ${CMAKE_C_COMPILER} PROGRAM)
endif(NOT CMAKE_C_COMPILER)
if(NOT EXISTS ${IOS_C_COMPILER})
message(FATAL_ERROR "Cannot find C compiler: ${IOS_C_COMPILER}")
endif()
if(NOT CMAKE_CXX_COMPILER)
# Default to use clang++
execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find clang++
OUTPUT_VARIABLE IOS_CXX_COMPILER
RESULT_VARIABLE IOS_CXX_COMPILER_RESULT
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
if(${IOS_CXX_COMPILER_RESULT})
get_filename_component(IOS_CXX_COMPILER clang++ PROGRAM)
endif()
else(NOT CMAKE_CXX_COMPILER)
# User can set it in cmake command
get_filename_component(IOS_CXX_COMPILER ${CMAKE_CXX_COMPILER} PROGRAM)
endif(NOT CMAKE_CXX_COMPILER)
if(NOT EXISTS ${IOS_CXX_COMPILER})
message(FATAL_ERROR "Cannot find CXX compiler: ${IOS_CXX_COMPILER}")
endif()
set(CMAKE_C_COMPILER ${IOS_C_COMPILER} CACHE PATH "C compiler" FORCE)
set(CMAKE_CXX_COMPILER ${IOS_CXX_COMPILER} CACHE PATH "CXX compiler" FORCE)
set(CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG "-compatibility_version ")
set(CMAKE_C_OSX_CURRENT_VERSION_FLAG "-current_version ")
set(CMAKE_CXX_OSX_COMPATIBILITY_VERSION_FLAG "${CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG}")
set(CMAKE_CXX_OSX_CURRENT_VERSION_FLAG "${CMAKE_C_OSX_CURRENT_VERSION_FLAG}")
# Set iOS specific C/C++ flags
if(IOS_PLATFORM STREQUAL "OS")
if(XCODE_VERSION VERSION_LESS "7.0")
set(XCODE_IOS_PLATFORM_VERSION_FLAGS "-mios-version-min=${IOS_DEPLOYMENT_TARGET}")
else()
# Xcode 7.0+ uses flags we can build directly from XCODE_IOS_PLATFORM.
set(XCODE_IOS_PLATFORM_VERSION_FLAGS "-m${XCODE_IOS_PLATFORM}-version-min=${IOS_DEPLOYMENT_TARGET}")
endif()
else()
set(XCODE_IOS_FLATFORM_VERSION_FLAGS "-mios-simulator-version-min=${IOS_DEPLOYMENT_TARGET}")
endif()
if(IOS_ENABLE_BITCODE)
set(XCODE_IOS_BITCODE_FLAGS "${IOS_COMPILER_FLAGS} -fembed-bitcode")
else()
set(XCODE_IOS_BITCODE_FLAGS "")
endif()
set(IOS_COMPILER_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} ${XCODE_IOS_BITCODE_FLAGS}")
# Hidden visibilty is required for cxx on iOS
set(CMAKE_C_FLAGS "${IOS_COMPILER_FLAGS} ${CMAKE_C_FLAGS}" CACHE STRING "C flags")
set(CMAKE_CXX_FLAGS "${IOS_COMPILER_FLAGS} -fvisibility-inlines-hidden ${CMAKE_CXX_FLAGS}" CACHE STRING "CXX flags")
set(IOS_LINK_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} -Wl,-search_paths_first")
if(IOS_USE_VECLIB_FOR_BLAS)
# Find vecLib for iOS
set(VECLIB_SEARCH_DIRS
${IOS_SDK_ROOT}/System/Library/Frameworks/Accelerate.framework/Versions/Current/Frameworks
${IOS_SDK_ROOT}/System/Library/Frameworks/Accelerate.framework/Frameworks
)
find_path(VECLIB_INC_DIR vecLib.h PATHS ${VECLIB_SEARCH_DIRS}/vecLib.framework/Headers)
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(vecLib DEFAULT_MSG VECLIB_INC_DIR)
if(VECLIB_FOUND)
if(VECLIB_INC_DIR MATCHES "^/System/Library/Frameworks/vecLib.framework.*")
set(IOS_LINK_FLAGS ${IOS_LINK_FLAGS} -lcblas "-framework vecLib")
message(STATUS "Found standalone vecLib.framework")
else()
set(IOS_LINK_FLAGS ${IOS_LINK_FLAGS} -lcblas "-framework Accelerate")
message(STATUS "Found vecLib as part of Accelerate.framework")
endif()
endif()
endif()
set(CMAKE_C_LINK_FLAGS "${IOS_LINK_FLAGS} ${CMAKE_C_LINK_FLAGS}")
set(CMAKE_CXX_LINK_FLAGS "${IOS_LINK_FLAGS} ${CMAKE_CXX_LINK_FLAGS}")
set(CMAKE_PLATFORM_HAS_INSTALLNAME 1)
if(NOT IOS_ENABLE_BITCODE)
set(CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS "-dynamiclib -headerpad_max_install_names")
set(CMAKE_SHARED_MODULE_CREATE_C_FLAGS "-bundle -headerpad_max_install_names")
else()
set(CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS "-dynamiclib")
set(CMAKE_SHARED_MODULE_CREATE_C_FLAGS "-bundle")
endif()
set(CMAKE_SHARED_MODULE_LOADER_C_FLAG "-Wl,-bundle_loader,")
set(CMAKE_SHARED_MODULE_LOADER_CXX_FLAG "-Wl,-bundle_loader,")
set(CMAKE_FIND_LIBRARY_SUFFIXES ".dylib" ".so" ".a")
# hack: if a new cmake (which uses CMAKE_INSTALL_NAME_TOOL) runs on an old build tree
# (where install_name_tool was hardcoded) and where CMAKE_INSTALL_NAME_TOOL isn't in the cache
# and still cmake didn't fail in CMakeFindBinUtils.cmake (because it isn't rerun)
# hardcode CMAKE_INSTALL_NAME_TOOL here to install_name_tool, so it behaves as it did before, Alex
if(NOT DEFINED CMAKE_INSTALL_NAME_TOOL)
find_program(CMAKE_INSTALL_NAME_TOOL install_name_tool)
endif()
# Set the find root to the iOS developer roots and to user defined paths
set(CMAKE_FIND_ROOT_PATH ${IOS_DEVELOPER_ROOT} ${IOS_SDK_ROOT} ${CMAKE_PREFIX_PATH}
CACHE string "iOS find search path root")
# default to searching for frameworks first
set(CMAKE_FIND_FRAMEWORK FIRST)
# set up the default search directories for frameworks
set(CMAKE_SYSTEM_FRAMEWORK_PATH
${IOS_SDK_ROOT}/System/Library/Frameworks
${IOS_SDK_ROOT}/System/Library/PrivateFrameworks
${IOS_SDK_ROOT}/Developer/Library/Frameworks
)
# only search the iOS sdks, not the remainder of the host filesystem
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
message(STATUS "iOS: Targeting iOS '${CMAKE_SYSTEM_VERSION}', "
"building for '${IOS_PLATFORM}' platform, with architecture '${CMAKE_OSX_ARCHITECTURES}'")
message(STATUS "System CMAKE_C_FLAGS: ${CMAKE_C_FLAGS}")
message(STATUS "System CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
# Used in ExternalProject command
string(REPLACE ";" "\\$<SEMICOLON>" EXTERNAL_IOS_ARCHITECTURES "${CMAKE_OSX_ARCHITECTURES}")
set(EXTERNAL_OPTIONAL_ARGS
-DCMAKE_OSX_SYSROOT=${CMAKE_OSX_SYSROOT}
-DCMAKE_OSX_ARCHITECTURES=${EXTERNAL_IOS_ARCHITECTURES})
# This little macro lets you set any XCode specific property
macro(set_xcode_property TARGET XCODE_PROPERTY XCODE_VALUE)
set_property (TARGET ${TARGET} PROPERTY XCODE_ATTRIBUTE_${XCODE_PROPERTY} ${XCODE_VALUE})
endmacro(set_xcode_property)
# This macro lets you find executable programs on the host system
macro(find_host_package)
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE NEVER)
set(IOS FALSE)
find_package(${ARGN})
set(IOS TRUE)
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
endmacro(find_host_package)
......@@ -39,13 +39,14 @@ ExternalProject_Add(
PREFIX ${GFLAGS_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR}
CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON
CMAKE_ARGS -DBUILD_TESTING=OFF
CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR}
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DBUILD_TESTING=OFF
-DCMAKE_BUILD_TYPE=Release
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR}
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=Release
......
......@@ -34,16 +34,17 @@ ExternalProject_Add(
PREFIX ${GLOG_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR}
CMAKE_ARGS -DCMAKE_INSTALL_LIBDIR=${GLOG_INSTALL_DIR}/lib
CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON
CMAKE_ARGS -DWITH_GFLAGS=ON
CMAKE_ARGS -Dgflags_DIR=${GFLAGS_INSTALL_DIR}/lib/cmake/gflags
CMAKE_ARGS -DBUILD_TESTING=OFF
CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR=${GLOG_INSTALL_DIR}/lib
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DWITH_GFLAGS=ON
-Dgflags_DIR=${GFLAGS_INSTALL_DIR}/lib/cmake/gflags
-DBUILD_TESTING=OFF
-DCMAKE_BUILD_TYPE=Release
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GLOG_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR:PATH=${GLOG_INSTALL_DIR}/lib
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
......
......@@ -48,15 +48,16 @@ IF(WITH_TESTING)
PREFIX ${GTEST_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${GTEST_INSTALL_DIR}
CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON
CMAKE_ARGS -DBUILD_GMOCK=ON
CMAKE_ARGS -Dgtest_disable_pthreads=ON
CMAKE_ARGS -Dgtest_force_shared_crt=ON
CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_INSTALL_PREFIX=${GTEST_INSTALL_DIR}
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DBUILD_GMOCK=ON
-Dgtest_disable_pthreads=ON
-Dgtest_force_shared_crt=ON
-DCMAKE_BUILD_TYPE=Release
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GTEST_INSTALL_DIR}
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=Release
......
......@@ -29,30 +29,41 @@ IF(NOT ${CBLAS_FOUND})
"${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
CACHE FILEPATH "openblas library." FORCE)
IF(APPLE)
SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}")
SET(COMMON_ARGS CC=${OPENBLAS_CC} NO_SHARED=1 NO_LAPACK=1 libs)
ELSE()
SET(COMMON_ARGS CC=${CMAKE_C_COMPILER} NO_SHARED=1 NO_LAPACK=1 libs)
ENDIF()
SET(OPENBLAS_CC "${CMAKE_C_COMPILER}")
IF(CMAKE_CROSSCOMPILING)
SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER})
GET_FILENAME_COMPONENT(CROSS_SUFFIX ${CMAKE_C_COMPILER} DIRECTORY)
SET(CROSS_SUFFIX ${CROSS_SUFFIX}/)
IF(ANDROID)
# arm_soft_fp_abi branch of OpenBLAS to support softfp
# https://github.com/xianyi/OpenBLAS/tree/arm_soft_fp_abi
SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5")
IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
SET(TARGET "ARMV7")
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0)
ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a")
SET(TARGET "ARMV8")
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0)
ENDIF()
ELSEIF(IOS)
# FIXME(liuyiqun): support multiple architectures
SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5")
SET(OPENBLAS_CC "${OPENBLAS_CC} ${CMAKE_C_FLAGS} -isysroot ${CMAKE_OSX_SYSROOT}")
IF(CMAKE_OSX_ARCHITECTURES MATCHES "armv7")
SET(OPENBLAS_CC "${OPENBLAS_CC} -arch armv7")
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0)
ELSEIF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
SET(OPENBLAS_CC "${OPENBLAS_CC} -arch arm64")
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0 CROSS_SUFFIX=${CROSS_SUFFIX})
ENDIF()
SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=${TARGET} ARM_SOFTFP_ABI=1 USE_THREAD=0)
ELSEIF(RPI)
# use hardfp
SET(OPENBLAS_COMMIT "v0.2.20")
SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=ARMV7 USE_THREAD=0)
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 USE_THREAD=0)
ENDIF()
ELSE()
IF(APPLE)
SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}")
ENDIF()
SET(OPENBLAS_COMMIT "v0.2.20")
SET(OPTIONAL_ARGS "")
IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(_64)?$")
......@@ -60,6 +71,8 @@ IF(NOT ${CBLAS_FOUND})
ENDIF()
ENDIF()
SET(COMMON_ARGS CC=${OPENBLAS_CC} NO_SHARED=1 NO_LAPACK=1 libs)
ExternalProject_Add(
extern_openblas
${EXTERNAL_PROJECT_LOG_ARGS}
......
......@@ -173,7 +173,8 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
"-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}"
"-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}"
"-Dprotobuf_WITH_ZLIB=ON"
"-DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}")
"-DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}"
${EXTERNAL_OPTIONAL_ARGS})
SET(OPTIONAL_CACHE_ARGS "-DZLIB_ROOT:STRING=${ZLIB_ROOT}")
ENDIF()
......
......@@ -12,16 +12,17 @@
# See the License for the specific language governing permissions and
# limitations under the License.
INCLUDE(ExternalProject)
IF(NOT WITH_PYTHON)
return()
ENDIF()
INCLUDE(python_module)
FIND_PACKAGE(PythonInterp 2.7)
IF(WITH_PYTHON)
FIND_PACKAGE(PythonLibs 2.7)
# Fixme: Maybe find a static library. Get SHARED/STATIC by FIND_PACKAGE.
ADD_LIBRARY(python SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES})
ENDIF(WITH_PYTHON)
FIND_PACKAGE(PythonLibs 2.7)
# Fixme: Maybe find a static library. Get SHARED/STATIC by FIND_PACKAGE.
ADD_LIBRARY(python SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES})
SET(py_env "")
IF(PYTHONINTERP_FOUND)
......@@ -36,9 +37,5 @@ IF(PYTHONINTERP_FOUND)
ENDIF()
ENDIF(PYTHONINTERP_FOUND)
IF(WITH_PYTHON)
INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR})
INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR})
ELSE()
SET(PYTHON_LIBRARIES "")
ENDIF()
INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR})
INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR})
......@@ -12,6 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
IF(NOT WITH_SWIG_PY)
return()
ENDIF()
FIND_PACKAGE(SWIG)
IF(NOT SWIG_FOUND)
......
......@@ -16,25 +16,14 @@ INCLUDE(ExternalProject)
SET(WARPCTC_SOURCES_DIR ${THIRD_PARTY_PATH}/warpctc)
SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc)
SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include" CACHE PATH "Warp-ctc Directory" FORCE)
INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR})
SET(WARPCTC_LIB_DIR "${WARPCTC_INSTALL_DIR}/lib" CACHE PATH "Warp-ctc Library Directory" FORCE)
IF(WIN32)
SET(WARPCTC_LIBRARIES
"${WARPCTC_INSTALL_DIR}/lib/warpctc.dll" CACHE FILEPATH "Warp-ctc Library" FORCE)
ELSE(WIN32)
IF(APPLE)
SET(_warpctc_SHARED_SUFFIX dylib)
ELSE(APPLE)
SET(_warpctc_SHARED_SUFFIX so)
ENDIF(APPLE)
SET(WARPCTC_LIBRARIES
"${WARPCTC_INSTALL_DIR}/lib/libwarpctc.${_warpctc_SHARED_SUFFIX}" CACHE FILEPATH "Warp-ctc Library" FORCE)
ENDIF(WIN32)
SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include"
CACHE PATH "Warp-ctc Directory" FORCE)
# Used in unit test test_WarpCTCLayer
SET(WARPCTC_LIB_DIR "${WARPCTC_INSTALL_DIR}/lib"
CACHE PATH "Warp-ctc Library Directory" FORCE)
SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_SHARED_LIBRARY_SUFFIX}"
CACHE FILEPATH "Warp-ctc Library" FORCE)
IF(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" )
SET(USE_OMP OFF)
......@@ -49,22 +38,26 @@ ExternalProject_Add(
PREFIX ${WARPCTC_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}
CMAKE_ARGS -DWITH_GPU=${WITH_GPU}
CMAKE_ARGS -DWITH_OMP=${USE_OMP}
CMAKE_ARGS -DWITH_TORCH=OFF
CMAKE_ARGS -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON
CMAKE_ARGS -DBUILD_SHARED=ON
CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON
CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}
-DWITH_GPU=${WITH_GPU}
-DWITH_OMP=${USE_OMP}
-DWITH_TORCH=OFF
-DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON
-DBUILD_SHARED=ON
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DCMAKE_BUILD_TYPE=Release
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=Release
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR}
)
MESSAGE(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}")
INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR})
ADD_LIBRARY(warpctc STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET warpctc PROPERTY IMPORTED_LOCATION ${WARPCTC_LIBRARIES})
ADD_DEPENDENCIES(warpctc extern_warpctc)
......
......@@ -34,15 +34,16 @@ ExternalProject_Add(
GIT_TAG "v1.2.8"
PREFIX ${ZLIB_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${ZLIB_INSTALL_DIR}
CMAKE_ARGS -DBUILD_SHARED_LIBS=OFF
CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON
CMAKE_ARGS -DCMAKE_MACOSX_RPATH=ON
CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release
-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_INSTALL_PREFIX=${ZLIB_INSTALL_DIR}
-DBUILD_SHARED_LIBS=OFF
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DCMAKE_MACOSX_RPATH=ON
-DCMAKE_BUILD_TYPE=Release
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ZLIB_INSTALL_DIR}
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=Release
......
......@@ -128,8 +128,10 @@ set(GPU_COMMON_FLAGS
)
if (APPLE)
if(NOT CMAKE_CROSSCOMPILING)
# On Mac OS X build fat binaries with x86_64 architectures by default.
set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE)
endif()
else()
set(GPU_COMMON_FLAGS
-Wall
......
......@@ -106,22 +106,22 @@ function(merge_static_libs TARGET_NAME)
endforeach()
list(REMOVE_DUPLICATES libs_deps)
if(APPLE) # Use OSX's libtool to merge archives
# To produce a library we need at least one source file.
# It is created by add_custom_command below and will helps
# also help to track dependencies.
set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c)
set(target_SRCS ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c)
if(APPLE) # Use OSX's libtool to merge archives
# Make the generated dummy source file depended on all static input
# libs. If input lib changes,the source file is touched
# which causes the desired effect (relink).
add_custom_command(OUTPUT ${dummyfile}
COMMAND ${CMAKE_COMMAND} -E touch ${dummyfile}
add_custom_command(OUTPUT ${target_SRCS}
COMMAND ${CMAKE_COMMAND} -E touch ${target_SRCS}
DEPENDS ${libs})
# Generate dummy staic lib
file(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
add_library(${TARGET_NAME} STATIC ${dummyfile})
file(WRITE ${target_SRCS} "const char *dummy = \"${target_SRCS}\";")
add_library(${TARGET_NAME} STATIC ${target_SRCS})
target_link_libraries(${TARGET_NAME} ${libs_deps})
foreach(lib ${libs})
......@@ -130,11 +130,14 @@ function(merge_static_libs TARGET_NAME)
endforeach()
add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a"
COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles})
COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles}
)
else() # general UNIX: use "ar" to extract objects and re-add to a common lib
set(target_DIR ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}.dir)
foreach(lib ${libs})
set(objlistfile ${lib}.objlist) # list of objects in the input library
set(objdir ${lib}.objdir)
set(objlistfile ${target_DIR}/${lib}.objlist) # list of objects in the input library
set(objdir ${target_DIR}/${lib}.objdir)
add_custom_command(OUTPUT ${objdir}
COMMAND ${CMAKE_COMMAND} -E make_directory ${objdir}
......@@ -142,31 +145,32 @@ function(merge_static_libs TARGET_NAME)
add_custom_command(OUTPUT ${objlistfile}
COMMAND ${CMAKE_AR} -x "$<TARGET_FILE:${lib}>"
COMMAND ${CMAKE_AR} -t "$<TARGET_FILE:${lib}>" > ../${objlistfile}
COMMAND ${CMAKE_AR} -t "$<TARGET_FILE:${lib}>" > ${objlistfile}
DEPENDS ${lib} ${objdir}
WORKING_DIRECTORY ${objdir})
# Empty dummy source file that goes into merged library
set(mergebase ${lib}.mergebase.c)
add_custom_command(OUTPUT ${mergebase}
COMMAND ${CMAKE_COMMAND} -E touch ${mergebase}
DEPENDS ${objlistfile})
list(APPEND mergebases "${mergebase}")
list(APPEND target_OBJS "${objlistfile}")
endforeach()
add_library(${TARGET_NAME} STATIC ${mergebases})
# Make the generated dummy source file depended on all static input
# libs. If input lib changes,the source file is touched
# which causes the desired effect (relink).
add_custom_command(OUTPUT ${target_SRCS}
COMMAND ${CMAKE_COMMAND} -E touch ${target_SRCS}
DEPENDS ${libs} ${target_OBJS})
# Generate dummy staic lib
file(WRITE ${target_SRCS} "const char *dummy = \"${target_SRCS}\";")
add_library(${TARGET_NAME} STATIC ${target_SRCS})
target_link_libraries(${TARGET_NAME} ${libs_deps})
# Get the file name of the generated library
set(outlibfile "$<TARGET_FILE:${TARGET_NAME}>")
set(target_LIBNAME "$<TARGET_FILE:${TARGET_NAME}>")
foreach(lib ${libs})
add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
COMMAND ${CMAKE_AR} cr ${outlibfile} *.o
COMMAND ${CMAKE_RANLIB} ${outlibfile}
WORKING_DIRECTORY ${lib}.objdir)
endforeach()
COMMAND ${CMAKE_AR} crs ${target_LIBNAME} `find ${target_DIR} -name '*.o'`
COMMAND ${CMAKE_RANLIB} ${target_LIBNAME}
WORKING_DIRECTORY ${target_DIR})
endif()
endfunction(merge_static_libs)
......@@ -196,7 +200,7 @@ function(cc_library TARGET_NAME)
add_style_check_target(${TARGET_NAME} ${cc_library_SRCS} ${cc_library_HEADERS})
else(cc_library_SRCS)
if (cc_library_DEPS)
if(cc_library_DEPS)
merge_static_libs(${TARGET_NAME} ${cc_library_DEPS})
else()
message(FATAL "Please specify source file or library in cc_library.")
......@@ -249,7 +253,7 @@ function(nv_library TARGET_NAME)
foreach(source_file ${nv_library_SRCS})
string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
list(APPEND cc_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
list(APPEND nv_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
endif()
endforeach()
add_style_check_target(${TARGET_NAME} ${nv_library_SRCS} ${nv_library_HEADERS})
......
......@@ -24,11 +24,10 @@ IF(WIN32)
SET(HOST_SYSTEM "win32")
ELSE(WIN32)
IF(APPLE)
EXEC_PROGRAM (sw_vers ARGS -productVersion OUTPUT_VARIABLE MACOSX_VERSION)
STRING(REGEX MATCH "[0-9]+.[0-9]+" VERSION "${MACOSX_VERSION}")
SET(MACOS_VERSION ${VERSION})
SET(HOST_SYSTEM "macosx")
IF(NOT DEFINED ENV{MACOSX_DEPLOYMENT_TARGET})
EXEC_PROGRAM(sw_vers ARGS -productVersion OUTPUT_VARIABLE HOST_SYSTEM_VERSION)
STRING(REGEX MATCH "[0-9]+.[0-9]+" MACOS_VERSION "${HOST_SYSTEM_VERSION}")
IF(NOT DEFINED $ENV{MACOSX_DEPLOYMENT_TARGET})
# Set cache variable - end user may change this during ccmake or cmake-gui configure.
SET(CMAKE_OSX_DEPLOYMENT_TARGET ${MACOS_VERSION} CACHE STRING
"Minimum OS X version to target for deployment (at runtime); newer APIs weak linked. Set to empty string for default value.")
......@@ -49,6 +48,8 @@ ELSE(WIN32)
ELSEIF(LINUX_ISSUE MATCHES "Fedora")
SET(HOST_SYSTEM "fedora")
ENDIF()
STRING(REGEX MATCH "(([0-9]+)\\.)+([0-9]+)" HOST_SYSTEM_VERSION "${LINUX_ISSUE}")
ENDIF(EXISTS "/etc/issue")
IF(EXISTS "/etc/redhat-release")
......@@ -70,7 +71,7 @@ CMAKE_HOST_SYSTEM_INFORMATION(RESULT CPU_CORES QUERY NUMBER_OF_LOGICAL_CORES)
MARK_AS_ADVANCED(HOST_SYSTEM CPU_CORES)
MESSAGE(STATUS "Found Paddle host system: ${HOST_SYSTEM}")
MESSAGE(STATUS "Found Paddle host system: ${HOST_SYSTEM}, version: ${HOST_SYSTEM_VERSION}")
MESSAGE(STATUS "Found Paddle host system's CPU: ${CPU_CORES} cores")
# configuration for cross-compiling
......@@ -82,6 +83,9 @@ IF(DEFINED CMAKE_SYSTEM_NAME)
ELSEIF(${CMAKE_SYSTEM_NAME} STREQUAL "RPi")
SET(RPI TRUE)
INCLUDE(cross_compiling/raspberry_pi)
ELSEIF(${CMAKE_SYSTEM_NAME} STREQUAL "iOS")
SET(IOS TRUE)
INCLUDE(cross_compiling/ios)
ENDIF()
ENDIF()
......
......@@ -25,8 +25,10 @@ function(target_circle_link_libraries TARGET_NAME)
endif()
endforeach()
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang")
if(NOT IOS_ENABLE_BITCODE)
list(APPEND LIBS "-undefined dynamic_lookup")
endif()
endif()
list(REVERSE libsInArgn)
target_link_libraries(${TARGET_NAME}
${LIBS}
......@@ -95,6 +97,10 @@ function(link_paddle_exe TARGET_NAME)
target_link_libraries(${TARGET_NAME} log)
endif(ANDROID)
if(WITH_MKLDNN AND WITH_MKLML AND MKLDNN_IOMP_DIR)
target_link_libraries(${TARGET_NAME} "-L${MKLDNN_IOMP_DIR} -liomp5 -Wl,--as-needed")
endif()
add_dependencies(${TARGET_NAME} ${external_project_dependencies})
endfunction()
......
......@@ -345,6 +345,11 @@ clip
.. autoclass:: paddle.v2.layer.clip
:noindex:
resize
------
.. autoclass:: paddle.v2.layer.resize
:noindex:
slope_intercept
---------------
.. autoclass:: paddle.v2.layer.slope_intercept
......
......@@ -3,7 +3,7 @@
## Ingredients
As our design principle is starting from the essence: how could we
allow users to express and solve their problems at neural networks.
allow users to express and solve their problems as neural networks.
Some essential concepts that our API have to provide include:
1. A *topology* is an expression of *layers*.
......@@ -233,7 +233,7 @@ paddle.dist_train(model,
num_parameter_servers=15)
```
The pseudo code if `paddle.dist_train` is as follows:
The pseudo code of `paddle.dist_train` is as follows:
```python
def dist_train(topology, parameters, trainer, reader, ...):
......
## Auto Gradient Checker Design
## Backgraound:
- Operator forward computing is easy to check if the result is right because it has a clear definition. **But** backpropagation is a notoriously difficult algorithm to debug and get right:
- 1. you should get the right backpropagation formula according to the forward computation.
- 2. you should implement it right in CPP.
- 3. it's difficult to prepare test data.
- Generally, it is easy to check whether the forward computation of an Operator is correct or not. However, backpropagation is a notoriously difficult algorithm to debug and get right:
1. you should get the right backpropagation formula according to the forward computation.
2. you should implement it right in CPP.
3. it's difficult to prepare test data.
- Auto gradient check gets a numeric gradient by forward Operator and use it as a reference of the backward Operator's result. It has several advantages:
- 1. numeric gradient checker only need forward operator.
- 2. user only need to prepare the input data for forward Operator.
- Auto gradient checking gets a numerical gradient by forward Operator and use it as a reference of the backward Operator's result. It has several advantages:
1. numerical gradient checker only need forward operator.
2. user only need to prepare the input data for forward Operator.
## Mathematical Theory
The following two document from stanford has a detailed explanation of how to get numeric gradient and why it's useful.
The following two document from Stanford has a detailed explanation of how to get numerical gradient and why it's useful.
- [Gradient checking and advanced optimization(en)](http://deeplearning.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization)
- [Gradient checking and advanced optimization(cn)](http://ufldl.stanford.edu/wiki/index.php/%E6%A2%AF%E5%BA%A6%E6%A3%80%E9%AA%8C%E4%B8%8E%E9%AB%98%E7%BA%A7%E4%BC%98%E5%8C%96)
......@@ -20,7 +20,7 @@ The following two document from stanford has a detailed explanation of how to ge
## Numeric Gradient Implementation
### Python Interface
```python
def get_numeric_gradient(op,
def get_numerical_gradient(op,
input_values,
output_name,
input_to_check,
......@@ -30,13 +30,13 @@ def get_numeric_gradient(op,
Get Numeric Gradient for an operator's input.
:param op: C++ operator instance, could be an network
:param input_values: The input variables. Should be an dictionary, key is
variable name. Value is numpy array.
:param input_values: The input variables. Should be an dictionary, whose key is
variable name, and value is numpy array.
:param output_name: The final output variable name.
:param input_to_check: The input variable need to get gradient.
:param input_to_check: The input variable with respect to which to compute the gradient.
:param delta: The perturbation value for numeric gradient method. The
smaller delta is, the more accurate result will get. But if that delta is
too small, it could occur numerical stability problem.
too small, it will suffer from numerical stability problem.
:param local_scope: The local scope used for get_numeric_gradient.
:return: The gradient array in numpy format.
"""
......@@ -45,28 +45,28 @@ def get_numeric_gradient(op,
### Explaination:
- Why need `output_name`
- One Operator may have multiple Output, you can get independent gradient from each Output. So user should set one output to calculate.
- An Operator may have multiple Output, one can get independent gradient from each Output. So caller should specify the name of the output variable.
- Why need `input_to_check`
- One operator may have multiple inputs. Gradient Op can calculate the gradient of these Inputs at the same time. But Numeric Gradient needs to calculate them one by one. So `get_numeric_gradient` is designed to calculate the gradient for one input. If you need to compute multiple inputs, you can call `get_numeric_gradient` multiple times.
- One operator may have multiple inputs. Gradient Op can calculate the gradient of these inputs at the same time. But Numeric Gradient needs to calculate them one by one. So `get_numeric_gradient` is designed to calculate the gradient for one input. If you need to compute multiple inputs, you can call `get_numeric_gradient` multiple times.
### Core Algorithm Implementation
```python
# we only compute gradient of one element each time.
# we use a for loop to compute the gradient of every element.
# we only compute gradient of one element a time.
# we use a for loop to compute the gradient of each element.
for i in xrange(tensor_size):
# get one input element throw it's index i.
# get one input element by its index i.
origin = tensor_to_check.get_float_element(i)
# add delta to it, run op and then get the sum of the result tensor.
# add delta to it, run op and then get the new value of the result tensor.
x_pos = origin + delta
tensor_to_check.set_float_element(i, x_pos)
y_pos = get_output()
# plus delta to this element, run op and get the sum of the result tensor.
# plus delta to this element, run op and get the new value of the result tensor.
x_neg = origin - delta
tensor_to_check.set_float_element(i, x_neg)
y_neg = get_output()
......@@ -85,15 +85,15 @@ def get_numeric_gradient(op,
Each Operator Kernel has three kinds of Gradient:
- 1. Numeric Gradient
- 2. CPU Operator Gradient
- 3. GPU Operator Gradient(if supported)
1. Numerical gradient
2. CPU kernel gradient
3. GPU kernel gradient (if supported)
Numeric Gradient Only relies on forward Operator. So we use Numeric Gradient as the reference value.
The numerical gradient only relies on forward Operator. So we use the numerical gradient as the reference value. And the gradient checking is performed in the following three steps:
- 1. calculate the numeric gradient.
- 2. calculate CPU kernel Gradient with the backward Operator and compare it with the numeric gradient.
- 3. calculate GPU kernel Gradient with the backward Operator and compare it with the numeric gradient.(if support GPU)
1. calculate the numerical gradient
2. calculate CPU kernel gradient with the backward Operator and compare it with the numerical gradient
3. calculate GPU kernel gradient with the backward Operator and compare it with the numeric gradient (if supported)
#### Python Interface
......@@ -110,8 +110,8 @@ Numeric Gradient Only relies on forward Operator. So we use Numeric Gradient as
:param forward_op: used to create backward_op
:param input_vars: numpy value of input variable. The following
computation will use these variables.
:param inputs_to_check: inputs var names that should check gradient.
:param output_name: output name that used to
:param inputs_to_check: the input variable with respect to which to compute the gradient.
:param output_name: The final output variable name.
:param max_relative_error: The relative tolerance parameter.
:param no_grad_set: used when create backward ops
:param only_cpu: only compute and check gradient on cpu kernel.
......@@ -120,24 +120,24 @@ Numeric Gradient Only relies on forward Operator. So we use Numeric Gradient as
```
### How to check if two numpy array is close enough?
if `abs_numeric_grad` is nearly zero, then use abs error for numeric_grad, not relative
if `abs_numerical_grad` is nearly zero, then use abs error for numerical_grad
```python
numeric_grad = ...
numerical_grad = ...
operator_grad = numpy.array(scope.find_var(grad_var_name(name)).get_tensor())
abs_numeric_grad = numpy.abs(numeric_grad)
# if abs_numeric_grad is nearly zero, then use abs error for numeric_grad, not relative
abs_numerical_grad = numpy.abs(numerical_grad)
# if abs_numerical_grad is nearly zero, then use abs error for numeric_grad, not relative
# error.
abs_numeric_grad[abs_numeric_grad < 1e-3] = 1
abs_numerical_grad[abs_numerical_grad < 1e-3] = 1
diff_mat = numpy.abs(abs_numeric_grad - operator_grad) / abs_numeric_grad
diff_mat = numpy.abs(abs_numerical_grad - operator_grad) / abs_numerical_grad
max_diff = numpy.max(diff_mat)
```
#### Notes:
1,The Input data for auto gradient checker should be reasonable to avoid numeric problem.
The Input data for auto gradient checker should be reasonable to avoid numerical stability problem.
#### Refs:
......
# Design Doc: Block and Scope
## The Representation of Computation
Both deep learning systems and programming languages help users describe computation procedures. These systems use various representations of computation:
- Caffe, Torch, and Paddle: sequences of layers.
- TensorFlow, Caffe2, Mxnet: graphs of operators.
- PaddlePaddle: nested blocks, like C++ and Java programs.
## Block in Programming Languages and Deep Learning
In programming languages, a block is a pair of curly braces that includes local variables definitions and a sequence of instructions, or operators.
Blocks work with control flow structures like `if`, `else`, and `for`, which have equivalents in deep learning:
| programming languages | PaddlePaddle |
|-----------------------|-----------------------|
| for, while loop | RNN, WhileOp |
| if, if-else, switch | IfElseOp, SwitchOp |
| sequential execution | a sequence of layers |
A key difference is that a C++ program describes a one pass computation, whereas a deep learning program describes both the forward and backward passes.
## Stack Frames and the Scope Hierarchy
The existence of the backward makes the execution of a block of traditional programs and PaddlePaddle different to each other:
| programming languages | PaddlePaddle |
|-----------------------|-------------------------------|
| stack | scope hierarchy |
| stack frame | scope |
| push at entering block| push at entering block |
| pop at leaving block | destroy at minibatch completes|
1. In traditional programs:
- When the execution enters the left curly brace of a block, the runtime pushes a frame into the stack, where it realizes local variables.
- After the execution leaves the right curly brace, the runtime pops the frame.
- The maximum number of frames in the stack is the maximum depth of nested blocks.
1. In PaddlePaddle
- When the execution enters a block, PaddlePaddle adds a new scope, where it realizes variables.
- PaddlePaddle doesn't pop a scope after the execution of the block because variables therein are to be used by the backward pass. So it has a stack forest known as a *scope hierarchy*.
- The height of the highest tree is the maximum depth of nested blocks.
- After the process of a minibatch, PaddlePaddle destroys the scope hierarchy.
## Use Blocks in C++ and PaddlePaddle Programs
Let us consolidate the discussion by presenting some examples.
### Blocks with `if-else` and `IfElseOp`
The following C++ programs shows how blocks are used with the `if-else` structure:
```c++
int x = 10;
int y = 20;
int out;
bool cond = false;
if (cond) {
int z = x + y;
out = softmax(z);
} else {
int z = fc(x);
out = z;
}
```
An equivalent PaddlePaddle program from the design doc of the [IfElseOp operator](./if_else_op.md) is as follows:
```python
import paddle as pd
x = var(10)
y = var(20)
cond = var(false)
ie = pd.create_ifelseop(inputs=[x], output_num=1)
with ie.true_block():
x = ie.inputs(true, 0)
z = operator.add(x, y)
ie.set_output(true, 0, operator.softmax(z))
with ie.false_block():
x = ie.inputs(false, 0)
z = layer.fc(x)
ie.set_output(true, 0, operator.softmax(z))
out = b(cond)
```
In both examples, the left branch computes `softmax(x+y)` and the right branch computes `fc(x)`.
A difference is that variables in the C++ program contain scalar values, whereas those in the PaddlePaddle programs are mini-batches of instances. The `ie.input(true, 0)` invocation returns instances in the 0-th input, `x`, that corresponds to true values in `cond` as the local variable `x`, where `ie.input(false, 0)` returns instances corresponding to false values.
### Blocks with `for` and `RNNOp`
The following RNN model from the [RNN design doc](./rnn.md)
```python
x = sequence([10, 20, 30])
m = var(0)
W = tensor()
U = tensor()
rnn = create_rnn(inputs=[input])
with rnn.stepnet() as net:
x = net.set_inputs(0)
h = net.add_memory(init=m)
fc_out = pd.matmul(W, x)
hidden_out = pd.matmul(U, h.pre(n=1))
sum = pd.add_two(fc_out, hidden_out)
act = pd.sigmoid(sum)
h.update(act) # update memory with act
net.set_outputs(0, act, hidden_out) # two outputs
o1, o2 = rnn()
print o1, o2
```
has its equivalent C++ program as follows
```c++
int* x = {10, 20, 30};
int m = 0;
int W = some_value();
int U = some_other_value();
int mem[sizeof(x) / sizeof(x[0]) + 1];
int o1[sizeof(x) / sizeof(x[0]) + 1];
int o2[sizeof(x) / sizeof(x[0]) + 1];
for (int i = 1; i <= sizeof(x)/sizeof(x[0]); ++i) {
int x = x[i-1];
if (i == 1) mem[0] = m;
int fc_out = W * x;
int hidden_out = Y * mem[i-1];
int sum = fc_out + hidden_out;
int act = sigmoid(sum);
mem[i] = act;
o1[i] = act;
o2[i] = hidden_out;
}
print_array(o1);
print_array(o2);
```
## Compilation and Execution
Like TensorFlow programs, a PaddlePaddle program is written in Python. The first part describes a neural network as a protobuf message, and the rest part executes the message for training or inference.
The generation of this protobuf message is like what a compiler generates a binary executable file. The execution of the message that the OS executes the binary file.
## The "Binary Executable File Format"
The definition of the protobuf message is as follows:
```protobuf
message BlockDesc {
repeated VarDesc vars = 1;
repeated OpDesc ops = 2;
}
```
The step net in above RNN example would look like
```
BlockDesc {
vars = {
VarDesc {...} // x
VarDesc {...} // h
VarDesc {...} // fc_out
VarDesc {...} // hidden_out
VarDesc {...} // sum
VarDesc {...} // act
}
ops = {
OpDesc {...} // matmul
OpDesc {...} // add_two
OpDesc {...} // sigmoid
}
};
```
Also, the RNN operator in above example is serialized into a protobuf message of type `OpDesc` and would look like:
```
OpDesc {
inputs = {0} // the index of x
outputs = {5, 3} // indices of act and hidden_out
attrs {
"memories" : {1} // the index of h
"step_net" : <above step net>
}
};
```
This `OpDesc` value is in the `ops` field of the `BlockDesc` value representing the global block.
## The Compilation of Blocks
During the generation of the Protobuf message, the Block should store VarDesc (the Protobuf message which describes Variable) and OpDesc (the Protobuf message which describes Operator).
VarDesc in a block should have its name scope to avoid local variables affect parent block's name scope.
Child block's name scopes should inherit the parent's so that OpDesc in child block can reference a VarDesc that stored in parent block. For example
```python
a = pd.Varaible(shape=[20, 20])
b = pd.fc(a, params=["fc.w", "fc.b"])
rnn = pd.create_rnn()
with rnn.stepnet() as net:
x = net.set_inputs(a)
# reuse fc's parameter
fc_without_b = pd.get_variable("fc.w")
net.set_outputs(fc_without_b)
out = rnn()
```
the method `pd.get_variable` can help retrieve a Variable by a name, a Variable may store in a parent block, but might be retrieved in a child block, so block should have a variable scope that supports inheritance.
In compiler design, the symbol table is a data structure created and maintained by compilers to store information about the occurrence of various entities such as variable names, function names, classes, etc.
To store the definition of variables and operators, we define a C++ class `SymbolTable`, like the one used in compilers.
`SymbolTable` can do the following stuff:
- store the definitions (some names and attributes) of variables and operators,
- to verify if a variable was declared,
- to make it possible to implement type checking (offer Protobuf message pointers to `InferShape` handlers).
```c++
// Information in SymbolTable is enough to trace the dependency graph. So maybe
// the Eval() interface takes a SymbolTable is enough.
class SymbolTable {
public:
SymbolTable(SymbolTable* parent) : parent_(parent) {}
OpDesc* NewOp(const string& name="");
// TODO determine whether name is generated by python or C++
// currently assume that a unique name will be generated by C++ if the
// argument name left default.
VarDesc* NewVar(const string& name="");
// find a VarDesc by name, if recursive true, find parent's SymbolTable
// recursively.
// this interface is introduced to support InferShape, find protobuf messages
// of variables and operators, pass pointers into InferShape.
// operator
//
// NOTE maybe some C++ classes such as VarDescBuilder and OpDescBuilder should
// be proposed and embedded into pybind to enable python operate on C++ pointers.
VarDesc* FindVar(const string& name, bool recursive=true);
OpDesc* FindOp(const string& name);
BlockDesc Compile() const;
private:
SymbolTable* parent_;
map<string, OpDesc> ops_;
map<string, VarDesc> vars_;
};
```
After all the description of variables and operators is added into SymbolTable,
the block has enough information to run.
The `Block` class takes a `BlockDesc` as input, and provide `Run` and `InferShape` functions.
```c++
namespace {
class Block : OperatorBase {
public:
Block(const BlockDesc& desc) desc_(desc) {}
void InferShape(const framework::Scope& scope) const override {
if (!symbols_ready_) {
CreateVariables(scope);
CreateOperators();
}
// should run InferShape first.
for (auto& op : runtime_table_.ops()) {
op->InferShape(scope);
}
}
void Run(const framework::Scope& scope,
const platform::DeviceContext& dev_ctx) const override {
PADDLE_ENFORCE(symbols_ready_, "operators and variables should be created first.");
for (auto& op : runtime_table_.ops()) {
op->Run(scope, dev_ctx);
}
}
void CreateVariables(const framework::Scope& scope);
void CreateOperators();
// some other necessary interfaces of NetOp are list below
// ...
private:
BlockDesc desc_;
bool symbols_ready_{false};
};
```
## The Execution of Blocks
Block inherits from OperatorBase, which has a Run method.
Block's Run method will run its operators sequentially.
There is another important interface called `Eval`, which take some arguments called targets, and generate a minimal graph which takes targets as the end points and creates a new Block,
after `Run`, `Eval` will get the latest value and return the targets.
The definition of Eval is as follows:
```c++
// clean a block description by targets using the corresponding dependency graph.
// return a new BlockDesc with minimal number of operators.
// NOTE not return a Block but the block's description so that this can be distributed
// to a cluster.
BlockDesc Prune(const BlockDesc& desc, vector<string> targets);
void Block::Eval(const vector<string>& targets,
const framework::Scope& scope,
const platform::DeviceContext& dev_ctx) {
BlockDesc min_desc = Prune(desc_, targets);
Block min_block(min_desc);
min_block.Run(scope, dev_ctx);
}
```
......@@ -53,12 +53,12 @@ Let's explain using an example. Suppose that we are going to compose the FC usi
```python
def operator.mul(X1, X2):
O = Var()
paddle.cpp.create_operator("mul", input={X1, Y1], output=O)
paddle.cpp.create_operator("mul", input={X1, Y1}, output=O)
return O
def operator.add(X1, X2):
O = Var()
paddle.cpp.create_operator("add", input={X1, X2], output=O)
paddle.cpp.create_operator("add", input={X1, X2}, output=O)
return O
```
......
......@@ -56,7 +56,7 @@ For each parameter, like W and b created by `layer.fc`, marked as double circles
## Block and Graph
The word block and graph are interchangable in the desgin of PaddlePaddle. A [Block[(https://github.com/PaddlePaddle/Paddle/pull/3708) is a metaphore of the code and local variables in a pair of curly braces in programming languages, where operators are like statements or instructions. A graph of operators and variables is a representation of the block.
The word block and graph are interchangable in the desgin of PaddlePaddle. A [Block](https://github.com/PaddlePaddle/Paddle/pull/3708) is a metaphore of the code and local variables in a pair of curly braces in programming languages, where operators are like statements or instructions. A graph of operators and variables is a representation of the block.
A Block keeps operators in an array `BlockDesc::ops`
......@@ -67,4 +67,4 @@ message BlockDesc {
}
```
in the order that there appear in user programs, like the Python program at the beginning of this article. We can imagine that in `ops`, we have some forward operators, followed by some gradient operators, and then some optimization operators.
in the order that they appear in user programs, like the Python program at the beginning of this article. We can imagine that in `ops`, we have some forward operators, followed by some gradient operators, and then some optimization operators.
IfOp should have only one branch. An IfOp operator takes a `cond` variable whose value must be a vector of N boolean elements. Its return value has M (M<=N) instances, each corresponds to a true element in `cond`.
```python
import paddle as pd
x = var()
y = var()
cond = var()
b = pd.create_ifop(inputs=[x], output_num=1)
with b.true_block():
x = b.inputs(0)
z = operator.add(x, y)
b.set_output(0, operator.softmax(z))
out = b(cond)
```
If we want the output still has N instances, we can use IfElseOp with a default value, whose minibatch size must be N:
IfOp should have only one branch. An IfOp operator takes a `cond` variable whose value must be a vector of N boolean elements. Its return value has N instances. If cond[i] == True, input instance input[i] will go through true_block() and generate output[i]; otherwise it will produce output from false_bloack().
```python
import paddle as pd
......@@ -39,7 +21,7 @@ with b.false_block():
out = b(cond)
```
If only true_block is set in an IfElseOp, we can have a default value for false as:
If only true_block is set in an IfElseOp, a special case is that we can have a default value for false as:
```python
import paddle as pd
......
digraph G {
rnn [label="1-th level RNN" shape=box]
subgraph cluster0 {
label = "time step 0"
sent0 [label="sentence"]
sent1 [label="sentence"]
rnn1 [label="2-th level RNN" shape=box]
sent0 -> rnn1
sent1 -> rnn1
}
subgraph cluster1 {
label = "time step 1"
sent2 [label="sentence"]
sent3 [label="sentence"]
rnn2 [label="2-th level RNN" shape=box]
sent2 -> rnn2
sent3 -> rnn2
}
subgraph cluster2 {
label = "time step 2"
sent4 [label="sentence"]
sent5 [label="sentence"]
rnn3 [label="2-th level RNN" shape=box]
sent4 -> rnn3
sent5 -> rnn3
}
para0 [label="paragraph info 0"]
para1 [label="paragraph info 1"]
para2 [label="paragraph info 2"]
rnn1 -> para0
rnn2 -> para1
rnn3 -> para2
para0 -> rnn
para1 -> rnn
para2 -> rnn
chapter [label="chapter info"]
rnn -> chapter
}
digraph G {
label = "simple RNN implementation"
ranksep=2;
//graph [nodesep=1, ranksep=1];
node[nodesep=1]
subgraph cluster0 {
label = "global scope"
rankdir = TB
W
boot_memory
input
output
}
subgraph cluster1 {
label = "step-scope 0"
rankdir = TB
memory0[label="memory"]
prememory0[label="pre-memory"]
step_input0[label="step input"]
step_output0[label="step output"]
}
subgraph cluster2 {
label = "step-scope 1"
rankdir = TB
memory1[label="memory"]
prememory1[label="pre-memory"]
step_input1[label="step input"]
step_output1[label="step output"]
}
subgraph cluster3 {
label = "step-scope 2"
rankdir = TB
memory2[label="memory"]
prememory2[label="pre-memory"]
step_input2[label="step input"]
step_output2[label="step output"]
}
stepnet [shape=box]
stepnet0 [shape=box, style=dashed]
stepnet1 [shape=box, style=dashed]
stepnet2 [shape=box, style=dashed]
edge[color=blue]
boot_memory -> prememory0 [label="init" color="blue"]
memory0 -> prememory1 [label="copy/reference" color="blue"]
memory1 -> prememory2 [label="copy/reference" color="blue"]
edge[color=black]
W -> stepnet0[constraint=false, style=dashed]
W -> stepnet1[constraint=false, style=dashed]
W -> stepnet2[constraint=false, style=dashed]
memory0 -> stepnet0[style=dashed]
prememory0 -> stepnet0 -> step_output0[style=dashed]
memory1 -> stepnet1[style=dashed]
prememory1 -> stepnet1 -> step_output1[style=dashed]
memory2 -> stepnet2[style=dashed]
prememory2 -> stepnet2 -> step_output2[style=dashed]
input -> step_input0
input -> step_input1
input -> step_input2
step_input0 -> stepnet0 [style=dashed]
step_input1 -> stepnet1[style=dashed]
step_input2 -> stepnet2[style=dashed]
step_output0 -> output
step_output1 -> output
step_output2 -> output
stepnet0 -> stepnet[style=dashed]
stepnet1 -> stepnet[style=dashed]
stepnet2 -> stepnet[style=dashed]
}
digraph G {
chapter [label="chapter"]
subgraph cluster0 {
label = "paragraph 0"
top_rnn0[label="top rnn step 0" shape=box]
p0 [label="paragraph 0"]
p1 [label="paragraph 1"]
}
subgraph cluster1{
label = "paragraph 1"
top_rnn1[label="top rnn step 1" shape=box]
p2 [label="paragraph 0"]
p3 [label="paragraph 1"]
}
subgraph cluster_p0 {
label = "sentence 0"
low_rnn0 [label="low rnn step 0" shape=box]
s00 [label="sentence 0"]
s01 [label="sentence 1"]
low_rnn0 -> s00
low_rnn0 -> s01
}
subgraph cluster_p1 {
label = "sentence 1"
low_rnn1 [label="low rnn step 1" shape=box]
s10 [label="sentence 0"]
s11 [label="sentence 1"]
low_rnn1 -> s10
low_rnn1 -> s11
}
subgraph cluster_p2 {
label = "sentence 1"
low_rnn2 [label="low rnn step 0" shape=box]
s20 [label="sentence 0"]
s21 [label="sentence 1"]
low_rnn2 -> s20
low_rnn2 -> s21
}
subgraph cluster_p3 {
label = "sentence 1"
low_rnn3 [label="low rnn step 1" shape=box]
s30 [label="sentence 0"]
s31 [label="sentence 1"]
low_rnn3 -> s30
low_rnn3 -> s31
}
chapter -> top_rnn0
chapter -> top_rnn1
top_rnn0 -> p0
top_rnn0 -> p1
top_rnn1 -> p2
top_rnn1 -> p3
p0 -> low_rnn0
p1 -> low_rnn1
p2 -> low_rnn2
p3 -> low_rnn3
}
# RNNOp design
This document is about an RNN operator which requires that instances in a mini-batch have the same length. We will have a more flexible RNN operator.
## RNN Algorithm Implementation
<p aligh="center">
<img src="./images/rnn.jpg"/>
</p>
The above diagram shows an RNN unrolled into a full network.
There are several important concepts:
- *step-net*: the sub-graph to run at each step,
- *memory*, $h_t$, the state of the current step,
- *ex-memory*, $h_{t-1}$, the state of the previous step,
- *initial memory value*, the ex-memory of the first step.
### Step-scope
There could be local variables defined in step-nets. PaddlePaddle runtime realizes these variables in *step-scopes* -- scopes created for each step.
<p aligh="center">
<img src="./images/rnn.png"/><br/>
Figure 2 the RNN's data flow
</p>
Please be aware that all steps run the same step-net. Each step
1. creates the step-scope,
2. realizes local variables, including step-outputs, in the step-scope, and
3. runs the step-net, which could use these variables.
The RNN operator will compose its output from step outputs in step scopes.
### Memory and Ex-memory
Let's give more details about memory and ex-memory via a simply example:
$$
h_t = U h_{t-1} + W x_t
$$,
where $h_t$ and $h_{t-1}$ are the memory and ex-memory of step $t$'s respectively.
In the implementation, we can make an ex-memory variable either "refers to" the memory variable of the previous step,
or copy the value of the previous memory value to the current ex-memory variable.
### Usage in Python
For more information on Block, please refer to the [design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md).
We can define an RNN's step-net using Block:
```python
import paddle as pd
X = some_op() # x is some operator's output, and is a LoDTensor
a = some_op()
# declare parameters
W = pd.Variable(shape=[20, 30])
U = pd.Variable(shape=[20, 30])
rnn = pd.create_rnn_op(output_num=1)
with rnn.stepnet():
x = rnn.add_input(X)
# declare a memory (rnn's step)
h = rnn.add_memory(init=a)
# h.pre_state() means previous memory of rnn
new_state = pd.add_two( pd.matmul(W, x) + pd.matmul(U, h.pre_state()))
# update current memory
h.update(new_state)
# indicate that h variables in all step scopes should be merged
rnn.add_outputs(h)
out = rnn()
```
Python API functions in above example:
- `rnn.add_input` indicates the parameter is a variable that will be segmented into step-inputs.
- `rnn.add_memory` creates a variable used as the memory.
- `rnn.add_outputs` mark the variables that will be concatenated across steps into the RNN output.
### Nested RNN and LoDTensor
An RNN whose step-net includes other RNN operators is known as an *nested RNN*.
For example, we could have a 2-level RNN, where the top level corresponds to paragraphs, and the lower level corresponds to sentences.
The following figure illustrates the feeding of text into the lower level, one sentence each step, and the feeding of step outputs to the top level. The final top level output is about the whole text.
<p aligh="center">
<img src="./images/2_level_rnn.png"/>
</p>
```python
import paddle as pd
W = pd.Variable(shape=[20, 30])
U = pd.Variable(shape=[20, 30])
W0 = pd.Variable(shape=[20, 30])
U0 = pd.Variable(shape=[20, 30])
# a is output of some op
a = some_op()
# chapter_data is a set of 128-dim word vectors
# the first level of LoD is sentence
# the second level of LoD is chapter
chapter_data = pd.Variable(shape=[None, 128], type=pd.lod_tensor, level=2)
def lower_level_rnn(paragraph):
'''
x: the input
'''
rnn = pd.create_rnn_op(output_num=1)
with rnn.stepnet():
sentence = rnn.add_input(paragraph, level=0)
h = rnn.add_memory(shape=[20, 30])
h.update(
pd.matmul(W, sentence) + pd.matmul(U, h.pre_state()))
# get the last state as sentence's info
rnn.add_outputs(h)
return rnn
top_level_rnn = pd.create_rnn_op(output_num=1)
with top_level_rnn.stepnet():
paragraph_data = rnn.add_input(chapter_data, level=1)
low_rnn = lower_level_rnn(paragraph_data)
paragraph_out = low_rnn()
h = rnn.add_memory(init=a)
h.update(
pd.matmul(W0, paragraph_data) + pd.matmul(U0, h.pre_state()))
top_level_rnn.add_outputs(h)
# just output the last step
chapter_out = top_level_rnn(output_all_steps=False)
```
in above example, the construction of the `top_level_rnn` calls `lower_level_rnn`. The input is a LoD Tensor. The top level RNN segments input text data into paragraphs, and the lower level RNN segments each paragraph into sentences.
By default, the `RNNOp` will concatenate the outputs from all the time steps,
if the `output_all_steps` set to False, it will only output the final time step.
<p align="center">
<img src="images/rnn_2level_data.png"/>
</p>
# Design Doc: The C++ Class `Parameters`
`Parameters` is a concept we designed in Paddle V2 API. `Parameters` is a container of parameters, and make Paddle can shared parameter between topologies. We described usages of `Parameter` in [api.md](./api.md).
`Parameters` is a concept we designed in PaddlePaddle V2 API. `Parameters` is a container of parameters, which makes PaddlePaddle capable of sharing parameter between topologies. We described usages of `Parameter` in [api.md](./api.md).
We used Python to implement Parameters when designing V2 API before. There are several defects for current implementation:
We used Python to implement Parameters when designing V2 API before. There are several defects for the current implementation:
* We just use `memcpy` to share Parameters between topologies, but this is very inefficient.
* We did not implement share Parameters while training. We just trigger `memcpy` when start training.
* We did not support sharing Parameters while training. We just trigger `memcpy` when start training.
It is necessary that we implement Parameters in CPP side. However, it could be a code refactoring for Paddle, because Paddle was designed for training only one topology before, i.e., each GradientMachine contains its Parameter as a data member. In current Paddle implementation, there are three concepts associated with `Parameters`:
It is necessary that we implement Parameters in CPP side. However, it could result a code refactoring for PaddlePaddle, because PaddlePaddle was designed for training only one topology before, i.e., each GradientMachine contains its Parameter as a data member. In current PaddlePaddle implementation, there are three concepts associated with `Parameters`:
1. `paddle::Parameter`. A `Parameters` is a container for `paddle::Parameter`.
It is evident that we should use `paddle::Parameter` when developing `Parameters`.
However, the `Parameter` class contains many functions and does not have a clear interface.
It contains `create/store Parameter`, `serialize/deserialize`, `optimize(i.e SGD)`, `randomize/zero`.
When we developing `Parameters`, we only use `create/store Parameter` functionality.
We should extract functionalities of Parameter into many classes to clean Paddle CPP implementation.
We should extract functionalities of Parameter into many classes to clean PaddlePaddle CPP implementation.
2. `paddle::GradientMachine` and its sub-classes, e.g., `paddle::MultiGradientMachine`, `paddle::NeuralNetwork`.
We should pass `Parameters` to `paddle::GradientMachine` when `forward/backward` to avoid `memcpy` between topologies.
......@@ -24,7 +24,7 @@ Also, we should handle multi-GPU/CPU training, because `forward` and `backward`
So `Parameters` should be used by `paddle::ParameterUpdater`, and `paddle::ParameterUpdater` should optimize `Parameters` (by SGD).
The step by step approach for implementation Parameters in Paddle C++ core is listed below. Each step should be a PR and could be merged into Paddle one by one.
The step by step approach for implementation Parameters in PaddlePaddle C++ core is listed below. Each step should be a PR and could be merged into PaddlePaddle one by one.
1. Clean `paddle::Parameter` interface. Extract the functionalities of `paddle::Parameter` to prepare for the implementation of Parameters.
......
# Design Doc: ProgramDesc
The basic structure of a PaddlePaddle program is some nested blocks, as a C++ or Java program.
As described in [graph.md](./graph.md), the first five lines of the following PaddlePaddle program
```python
x = layer.data("images")
l = layer.data("label")
y = layer.fc(x)
cost = layer.mse(y, l)
optimize(cost)
train(cost, reader=mnist.train())
```
generates, or compiles, a PaddelPaddle program, which is represented by the following protobuf message:
```protobuf
message ProgramDesc {
repeated BlockDesc blocks = 1;
}
message BlockDesc {
required int32 parent = 1;
repeated VarDesc vars = 2;
repeated OpDesc ops = 3;
}
message OpDesc {
AttrDesc attrs = 1;
...
}
message AttrDesc {
required AttrType type = 1;
// index into ProgramDesc::blocks when type==BLOCK
optional int32 block = 2;
...
}
```
When each of the first five lines runs, related Python function, e.g., `layer.fc`, calls C++ InferShape functions. This InferShape function needs to access the properties of VarDesc's accessed by the current OpDesc. These VarDesc's might not be defined in the current block, but in some ancestor blocks. This requires that we can trace the parent of a block.
A nested block is often an attribute of an operator, most likely, an IfElseOp or a WhileOp. In above solution, all blocks are in `ProgramDesc::blocks`, this implicitly assigns a zero-based ID to each block -- the index of the block in `ProgramDesc::blocks`. So that `AttrDesc::block` could be an integer block ID.
With this design, the InferShape function should take the following parameters:
```c++
void InferShape(int current_block,
int current_operator,
ProgramDesc* program // might change VarDesc values.
) {
...
}
```
where
- `current_block` indices into `ProgramDesc::blocks`,
- `current_operator` indices into `BlockDesc::ops`.
# Design Doc: Python API
Due to the refactorization of the PaddlePaddle core, we need Python classes to construct corresponding protobuf messages that describe a DL program.
| Python classes | Protobuf messages |
| --- | --- |
| Program | ProgramDesc |
| Block | BlockDesc |
| Operator | OpDesc |
| Variable | VarDesc |
Please be aware that these Python classes need to maintain some construction-time information, which are not part of the protobuf messages.
## Core Concepts
### Program
A `ProgramDesc` describes a [DL program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md), which is composed of an array of `BlockDesc`s. A `BlockDesc` refers to its parent block by its index in the array. For example, operators in the step block of an RNN operator needs to be able to access variables in its ancessor blocks.
Whenever we create a block, we need set its parent block to the current block, so the Python class `Program` needs to maintain a data member `current_block`.
```python
class Program(objects):
def __init__(self):
self.proto = core.NewProgram() # a C++ ProgramDesc pointer.
self.blocks = vector<Block>()
self.blocks.append(Block(self, -1)) # the global block
self.current_block = 0 # initialized to the global block
def global_block():
return self.blocks[0]
def current_block():
return self.get_block(self.current_block)
def rollback():
self.current_block = self.current_block().parent_idx
def create_block():
new_block_idx = len(self.block)
self.blocks.append(Block(self, self.current_block))
self.current_block = new_block_idx
return current_block()
```
`Program` is an accessor to the protobuf message `ProgramDesc`, which is created in C++ space, because the InferShape function is in C++, which manipulates `VarDesc` messages, which are in turn members of `BlockDesc`, which is a member of `ProgramDesc`.
`Program` creates the first block as the global block in its constructor. All parameters and their initializer operators are in the global block.
### Block
A [Block](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md) includes
1. a map from variable names to an instance of the Python `Variable` class, and
1. a list of `Operator` instances.
```python
class Block(objects):
def __init__(self, program, parent_idx):
self.proto = core.NewBlock(program.proto)
self.program = program
self.vars = map<string, Variable>()
self.ops = vector<Operator>()
self.parent_idx = parent_idx
def create_var(self, ...):
return Variable(self, ...)
def _create_global_var(self, ...):
program.global_block().create_var(...)
def create_parameter(self, name, ...):
# Parameter is a subclass of variable. See Parameter section for details.
self.vars[name] = Parameter(self._create_global_var(...), ...)
return self.vars[name]
def append_operator(self, ...):
self.ops.append(Operator(self, ...))
def prepend_operator(self, ...): # Parameter's ctor prepands initialize operators.
self.ops.prepend(Operator(self, ...))
```
`create_parameter` is necessary because parameters are global variables, those defined in the global block, but can be created in some sub-blocks, e.g., an FC layer in the step block of an RNN operator.
`prepand_operator` is necessary because the constructor of `Parameter` needs to create the initialize (or load) operator of the parameter, and would like to put it in the *preamble* of the global block.
### Operator
The `Operator` class fills in the `OpDesc` message and calls the C++ function `InferShape` to infer output shape from input shape.
```python
class Operator(object):
def __init__(self,
block, # Block
type, # string
inputs, # dict<string, Variable>
outputs,# dict<stirng, Variable>
attrs # dict<string, Any>
):
self.proto = core.NewOpDesc(block.proto, type, inputs, outputs, attrs)
core.infer_shape(self.proto, inputs, outputs)
def type(self):
return self.proto.type()
```
`Operator` creates the `OpDesc` message in C++ space, so could it call the `InferShape` function, which is in C++.
### Variable
Operators take Variables as its inputs and outputs.
```python
class Variable(object):
def __init__(self,
block=None, # Block
name=None, # string
shape, # tuple
dtype="float32", # string
lod_level=None # int
):
if name is None:
name = unique_name_generator()
self.name = name
self.block = block
self.proto = core.NewVarDesc(block.proto, name, shape, lod_level)
self.writer = None
```
Please be aware of `self.writer`, that tracks operator who creates the variable. It possible that there are more than one operators who write a variable, but in Python space, each writes to a variable is represented by a Variable class. This is guaranteed by the fact that **`core.NewVarDesc` must NOT create a new `VarDesc` message if its name already exists in the specified block**.
### Parameter
A parameter is a global variable with an initializer (or load) operator.
```python
class Parameter(Variable):
def __init__(self,
block=None, # Block
name=None, # string
shape, # tuple
dtype="float32", # string
lod_level=None # int
trainable, # bool
initialize_op_attrs,
optimize_op_attrs):
super(Parameter, self).__init__(block, name, shape, dtype, lod_level)
self.trainable = trainable
self.optimize_op_attrs = optimize_op_attrs
block.prepend(Operator(block, # Block
initialize_op_attrs['type'], # string
None, # no inputs
self, # output is the parameter
initialize_op_attrs)
```
When users create a parameter, s/he can call
```python
program.create_parameter(
...,
init_attr={
type: "uniform_random",
min: -1.0,
max: 1.0,
})
)
```
In above example, `init_attr.type` names an initialize operator. It can also name the load operator
```python
init_attr={
type: "load",
filename: "something.numpy",
}
```
`optimize_op_attrs` is not in the `VarDesc` message, but kept in the Python instance, as it will be used in the Python space when creating the optimize operator's `OpDesc`, and will be in the `OpDesc` message.
## Layer Functions
A layer is a Python function that creates some operators and variables. Layers simplify the work of application programmers.
### Data Layer
```python
def data_layer(name, type, column_name):
block = the_current_program.glolal_block()
var = block.create_global_var(
name=name,
shape=[None] + type.dims(),
dtype=type.dtype)
block.prepend_operator(block,
type="Feed",
inputs = None,
outputs = [var],
{column_name: column_name})
return var
```
The input to the feed operator is a special variable in the global scope, which is the output of [Python readers](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/reader/README.md).
### FC Layer
```python
def fc_layer(input, size, ...):
block = program.current_block()
w = block.create_parameter(...)
b = block.create_parameter(...)
out = block.create_var()
op = block.append_operator("FC", X=input, W=w, b=b, out=out)
out.writer = op
return out
```
......@@ -52,7 +52,7 @@ Here are valid outputs:
# a mini batch of three data items, each data item is a list (single column).
[([1,1,1],),
([2,2,2],),
([3,3,3],),
([3,3,3],)]
```
Please note that each item inside the list must be a tuple, below is an invalid output:
......
# Design Doc: Distributed Training Architecture
## Abstract
PaddlePaddle v0.10.0 uses the "trainer-parameter server"
architecture. We run multiple replicated instances of trainers (runs
the same code written by the user) and parameter servers for
distributed training. This architecture served us well, but has some
limitations:
1. Need to write special code to handle tasks which should only be run
by a single trainer. E.g., initializing model and saving model.
2. Model parallelism is hard: need to write if-else branches conditioned
on the trainer ID to partition model onto each trainer, and manually
write the inter-model-shard communication code.
3. The user can not directly specify the parameter update rule: need
to modify the parameter server C++ code and compile a new
binary. This adds complication for researchers: A lot of extra
effort is required. Besides, the training job submission program
may not allow running arbitrary binaries.
This design doc discusses PaddlePaddle's new distributed training
architecture that addresses the above limitations.
## Analysis
We will assume the user writes the trainer program by Python, the same
analysis holds if the trainer program is written in C++.
### Limitation 1
If we look at the Python code that the user writes, there are two
kinds of functionalities:
- The training logic such as load / save model and print log.
- The neural network definition such as the definition of the data
layer, the fully connected layer, the cost function and the
optimizer.
When we training with PaddlePaddle v0.10.0 distributedly, multiple
replicated Python instances are running on different nodes: both the
training logic and the neural network computation is replicated.
The tasks that should only run once all belong to the training logic,
if we only replicate the neural network computation, but do **not**
replicate the training logic, the limitation could be solved.
### Limitation 2
Model parallelism means running a single model on multiple nodes by
partitioning the model onto different nodes and managing the
inter-model-shard communications.
PaddlePaddle should be able to modify the nerual network computation
definition to support model parallelism automatically. However, the
computation is only specified in Python code, and PaddlePaddle can not
modify Python code.
Just like compiler uses a intermediate representation (IR) so that
programmer does not need to manually optimize their code in most of
the cases - the compiler will optimize the IR:
<img src="src/compiler.png"/>
We can have our own IR too: PaddlePaddle can support model parallel by
converting the IR so the user no longer need to manually do it in
Python:
<img src="src/paddle-compile.png"/>
The IR for PaddlePaddle after refactor is called `Block`, it specifies
the computation dependency graph and the variables used in the
computation.
### Limitation 3
The user can not directly specify the parameter update rule for the
parameter server because the parameter server does not use the same
computation definition as the trainer. Instead, the update rule is
baked in the parameter server. The user can not specify the update
rule in the same way of specifying the trainer computation.
This could be fixed by making the parameter server run the same
computation definition as the trainer. For a detailed explanation,
please
see
[Design Doc: Operation Graph Based Parameter Server](./dist_train.md)
## Distributed Training Architecture
The new distributed training architecture can address the above
limitations. Below is the illustration:
<img src="src/distributed_architecture.png"/>
The architecture includes major components: *PaddlePaddle Python*,
*PaddlePaddle converter* and *PaddlePaddle runtime*:
### PaddlePaddle Python
PaddlePaddle Python is the Python library that user's Python trainer
invoke to build the neural network topology, start training, etc.
```Python
paddle.init()
input = paddle.op.recordIO("/home/data/mnist.recordio") # file stored on the cluster
img, label = input[0], input[1]
hidden = paddle.layer.fc(input=img, size=200, act=paddle.activation.Tanh())
prediction = paddle.layer.fc(input=img, size=10, act=paddle.activation.Softmax())
cost = paddle.layer.classification_cost(input=prediction, label=label)
optimizer = paddle.optimizer.SGD(cost, learning_rate=0.01)
session = paddle.session.NewRemote(num_trainer=3, num_ps=2, GPU_per_trainer=1)
for i in range(1000):
_, cost_val = session.eval(targets=[cost, optimizer])
print cost_val
```
The code above is a typical Python trainer code, the neural network
topology is built using helper functions such as
`paddle.layer.fc`. The training is done by calling `session.eval`
iteratively.
#### session.eval
As shown in the graph, `session.eval` sends the IR and the evaluation
inputs/targets to the PaddlePaddle cluster for evaluation. The
targets can be any variable in the computation graph. When the target
is the `optimizer` variable, the neural network will be optimized
once. When the target is the `cost` variable, `session.eval` returns
the cost value.
The Python `session` is a wrapper of the C++ `Session` class. For more
information about `Session`, please
see [Design Doc: Session](./session.md).
### PaddlePaddle Converter
PaddlePaddle converter automatically converts the IR in the request
(IR and evaluation inputs/targets) from PaddlePaddle Python to new
partitioned IRs and dispatch the new IRs and evaluation inputs/targets
to different PaddlePaddle runtimes. Below are the steps:
1. Add `feed` OP that feeds the eval inputs, and `fetch` OP that
fetches the eval targets to the IR.
1. Extract a new computation (sub)graph with `feed` and `fetch` OP as
the boundary. The runtime does not need to run the OP that is not
dependent by the `fetch` OP.
1. Optimizes the computation graph.
1. Place the OPs in the graph onto different devices on different
PaddlePaddle runtime according to a placement algorithm and device
constraint specified by the user.
1. Partition the graph according to runtime boundaries and add `send` /
`recv` OP pair on the runtime boundaries.
1. Dispatch the partitioned graph to different PaddlePaddle runtimes.
1. PaddlePaddle runtimes with the `fetch` OP reports evaluation
results back to the converter, the convert reports the evaluation
results back to the PaddlePaddle Python.
The output IRs will be cached to optimize the conversion latency.
#### Placement Algorithm
Our first implementation will only support "trainer-parameter server"
placement: the parameters, initializers, and optimizers are placed on
the PaddlePaddle runtimes with the parameter server role. And
everything else will be placed on the PaddlePaddle runtimes with the
trainer role. This has the same functionality of our
"trainer-parameter server" architecture of PaddlePaddle v0.10.0, but
is more general and flexible.
In the future, we will implement the general placement algorithm,
which makes placements according to the input IR, and a model of
device computation time and device communication time. Model
parallelism requires the general placement algorithm.
### PaddlePaddle Runtime
The PaddlePaddle runtime owns multiple devices (e.g., CPUs, GPUs) and
runs the IR. The runtime does not need to do OP placement since it's
already done by the converter.
### Local Training Architecture
The local training architecture will be the same as the distributed
training architecture, the differences are everything runs locally,
and there is just one PaddlePaddle runtime:
<img src="src/local_architecture.png"/>
### Training Data
In PaddlePaddle v0.10.0, training data is typically read
with [data reader](../reader/README.md) from Python. This approach is
no longer efficient when training distributedly since the Python
process no longer runs on the same node with the trainer processes,
the Python reader will need to read from the distributed filesystem
(assuming it has the access) and send to the trainers, doubling the
network traffic.
When doing distributed training, the user can still use Python data
reader: the training data are sent with `session.eval`. However should
be used for debugging purpose only. The users are encouraged to use
the read data OPs.
## References:
[1] [TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45166.pdf)
[2] [TensorFlow: A System for Large-Scale Machine Learning](https://www.usenix.org/system/files/conference/osdi16/osdi16-abadi.pdf)
# Design Doc: Refactorization Overview
The goals of refactoring include:
1. Making it easy for external contributors to write new elementary computation operations.
1. Making the codebase clean and readable.
1. Designing a new computation representation -- a computation graph of operators and variables.
1. Implementing auto-scalability and auto fault recoverable distributed computing with the help of computation graphs.
## Computation Graphs
1. PaddlePaddle represents the computation, training and inference of Deep Learning models, by computation graphs.
1. Please refer to [computation graphs](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/graph.md) for a concrete example.
1. Users write Python programs to describe the graphs and run them (locally or remotely).
1. A graph is composed of *variables* and *operators*.
1. The description of graphs must be capable of being serialized/deserialized, so that:
1. It can to be sent to the cloud for distributed execution, and
1. It can be sent to clients for mobile or enterprise deployment.
1. The Python program does the following steps
1. *compilation*: run a Python program to generate a protobuf message representation of the graph and send it to
1. the C++ library `libpaddle.so` for local execution,
1. the master process of a distributed training job for training, or
1. the server process of a Kubernetes serving job for distributed serving.
1. *execution*: execute the graph by constructing instances of class [`Variable`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h#L24) and [`OperatorBase`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L70), according to the protobuf message.
## Description and Realization of Computation Graph
At compile time, the Python program generates a protobuf message representation of the graph, or the description of the graph.
At runtime, the C++ program realizes the graph and runs it.
| | Representation (protobuf messages) | Realization (C++ class objects) |
|---|---|---|
|Data|[VarDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L107)|[Variable](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h#L24)|
|Operation|[OpDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L35)|[Operator](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L64)|
|Block|BlockDesc|Block|
The word *graph* is interchangeable with *block* in this document. A graph represents computation steps and local variables similar to a C++/Java program block, or a pair of parentheses(`{` and `}`).
## Compilation and Execution
1. Run an application Python program to describe the graph. In particular, the Python application program does the following:
1. Create `VarDesc` to represent local/intermediate variables,
1. Create operators and set attributes,
1. Validate attribute values,
1. Infer the type and the shape of variables,
1. Plan memory-reuse for variables,
1. Generate the backward graph
1. Optimize the computation graph.
1. Potentially, split the graph for distributed training.
1. The invocation of `train` or [`infer`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/inference.py#L108) methods in the application Python program does the following:
1. Create a new Scope instance in the [scope hierarchy](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/scope.md) for each run of a block,
1. realize local variables defined in the BlockDesc message in the new scope,
1. a scope is similar to the stack frame in programming languages,
1. Create an instance of class `Block`, in which,
1. realize operators in the BlockDesc message,
1. Run the Block by calling
1. `Block::Eval(vector<Variable>* targets)` for forward and backward computations, or
1. `Block::Eval(vector<Operator>* targets)` for optimization.
## Intermediate Representation (IR)
```text
Compile Time -> IR -> Runtime
```
### Benefits of IR
- Optimization
```text
Compile Time -> IR -> Optimized IR -> Runtime
```
- Automatically send partitioned IR to different nodes.
- Automatic Data Parallelism
```text
Compile Time
|-> Single GPU IR
|-> [trainer-IR-0, trainer-IR-1, pserver-IR]
|-> Node-0 (runs trainer-IR-0)
|-> Node-1 (runs trainer-IR-1)
|-> Node-2 (runs pserver-IR)
```
- Automatic Model Parallelism (planned for future)
---
# Operator/OpWithKernel/OpKernel
![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/49caf1fb70820fb4a6c217634317c9306f361f36/op_op_with_kern_class_diagram.dot)
---
# Operator
![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/dd598e8f1976f5759f58af5e5ef94738a6b2e661/op.dot)
* `Operator` is the fundamental building block of the user interface.
* Operator stores input/output variable names, and attributes.
* The `InferShape` interface is used to infer the shape of the output variable shapes based on the shapes of the input variables.
* Use `Run` to compute the `output` variables from the `input` variables.
---
# OpWithKernel/Kernel
![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/9d7f4eba185cf41c8e2fbfb40ae21890dbddcd39/op_with_kernel.dot)
* `OpWithKernel` inherits `Operator`.
* `OpWithKernel` contains a Kernel map.
* `OpWithKernel::Run` get device's kernel, and invoke `OpKernel::Compute`.
* `OpKernelKey` is the map key. Only device place now, but may be data type later.
---
# Why separate Kernel and Operator
* Separate GPU and CPU code.
* Make Paddle capable of running without GPU.
* Make one operator (which is a user interface) and create many implementations.
* For example, same multiplication op can have different implementations kernels such as FP16 kernel, FP32 kernel, MKL, eigen kernel.
---
# Libraries for Kernel development
* `Eigen::Tensor` contains basic math and element-wise functions.
* Note that `Eigen::Tensor` has broadcast implementation.
* Limit the number of `tensor.device(dev) = ` in your code.
* `thrust::transform` and `std::transform`.
* `thrust` has the same API as C++ standard library. Using `transform`, one can quickly implement customized element-wise kernels.
* `thrust` also has more complex APIs, like `scan`, `reduce`, `reduce_by_key`.
* Hand-writing `GPUKernel` and `CPU` code
* Do not write in header (`.h`) files. CPU Kernel should be in cpp source (`.cc`) and GPU kernels should be in cuda (`.cu`) files. (GCC cannot compile GPU code.)
---
# Operator Registration
## Why is registration necessary?
We need a method to build mappings between Op type names and Op classes.
## How is registration implemented?
Maintaining a map, whose key is the type name and the value is the corresponding Op constructor.
---
# The Registry Map
### `OpInfoMap`
`op_type(string)` -> `OpInfo`
`OpInfo`:
- **`creator`**: The Op constructor.
- **`grad_op_type`**: The type of the gradient Op.
- **`proto`**: The Op's Protobuf, including inputs, outputs and required attributes.
- **`checker`**: Used to check attributes.
---
# Related Concepts
### Op_Maker
It's constructor takes `proto` and `checker`. They are completed during Op_Maker's construction. ([ScaleOpMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37))
### Register Macros
```cpp
REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, grad_op_class)
REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class)
```
### USE Macros
Make sure the registration process is executed and linked.
---
# Registration Process
1. Write an Op class and its gradient Op class, if required.
2. Write an Op maker class. In the constructor of this class, describe the inputs, outputs and attributes of the operator.
3. Invoke the macro `REGISTER_OP`. This macro will
1. Call maker class to complete the `proto` and the `checker`
2. Using the completed `proto` and `checker`, it will add a new key-value pair to the `OpInfoMap`
4. Invoke the `USE` macro in which the Op is used, to make sure that it is linked.
---
# Backward Module (1/2)
### Create Backward Operator
- Mapping from forward Op to backward Op
![backward](https://gist.githubusercontent.com/dzhwinter/a6fbd4623ee76c459f7f94591fd1abf0/raw/61026ab6e518e66bde66a889bc42557a1fccff33/backward.png)
---
# Backward Module (2/2)
### Build Backward Network
- **Input**: graph of forward operators
- **Output**: graph of backward operators
- **Corner cases in construction**
- Shared Variables => insert an `Add` operator to combine gradients
- No Gradient => insert a `fill_zero_grad` operator
- Recursive NetOp => call `Backward` recursively
- RNN Op => recursively call `Backward` on stepnet
---
# Scope, Variable, Tensor
* `Tensor` is an n-dimension array with type.
* Only dims and data pointers are stored in `Tensor`.
* All operations on `Tensor` are written in `Operator` or global functions.
* Variable length Tensor design [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md)
* `Variable` instances are the inputs and the outputs of an operator. Not just `Tensor`.
* `step_scopes` in RNN is a variable and not a tensor.
* `Scope` is where variables are stores.
* map<string `variable_name`, Variable>
* `Scope` has a hierarchical structure. The local scope can get variables from its parent scope.
---
# Block (in design)
## the difference between original RNNOp and Block
- As an operator is more intuitive than `RNNOp`,
- Offers a new interface `Eval(targets)` to deduce the minimal block to `Run`,
- Fits the compile-time/ runtime separation design paradigm.
- During the compilation, `SymbolTable` stores `VarDesc`s and `OpDesc`s and serialize to a `BlockDesc`
- When graph executes, a Block with `BlockDesc` is passed. It then creates `Op` and `Var` instances and then invokes `Run`.
---
# Milestone
- Take Paddle/books as the main line, the requirement of the models motivates framework refactoring,
- Model migration
- Framework development gives **priority support** to model migration, for example,
- the MNIST demo needs a Python interface,
- the RNN models require the framework to support `LoDTensor`.
- Determine some timelines,
- Frequently used Ops need to be migrated first,
- Different models can be migrated in parallel.
- Improve the framework at the same time
- Accept imperfection, concentrate on solving the specific problem at the right price.
---
# Control the migration quality
- Compare the performance of migrated models with old ones.
- Follow the google C++ style
- Build the automatic workflow of generating Python/C++ documentations.
- The documentation of layers and ops should be written inside the code.
- Take the documentation quality into account when submitting pull requests.
- Preview the documentations, read and improve them from a user's perspective.
# Design Doc: Gradient Operators Registration
## The Problem Posed
In our current operator registration mechanism, for each operator, the programmer should register a *gradient operator creator* function, which takes a C++ operator instance, and returns the corresponding gradient instance.
However, as we decided to separate the *compilation* and *execution* of DL models, we need to reshape the creator to take a protobuf `OpDesc` message, and returns a corresponding message.
More than that, the new registration mechanism need to support the fact that an operators' gradient computation might be a composition of operators.
## Current Implementation
OpInfos store in a association map which key is the operator type. The `grad_op_type` indicate associated gradient operator type. Operator can create gradient operator by `OpInfo::creator_` of gradient. The pseudo code is
```cpp
struct OpInfo {
std::function<OperatorBase*(...)> creator_;
std::string grad_op_type_;
...
};
map<string, OpInfo> OpInfoMap;
OperatorBase* CreateGradientOperator(const OperatorBase& op) {
return OpInfoMap.at(op.Type()).creator_(...);
}
```
## Proposed Solution
The mapping relationship between an operator and its gradient operators is a function. The interface of that function is:
```cpp
// (OpDesc) --> vector<OpDesc>
using GradOpDescMaker = std::function<std::vector<OpDesc>(const OpDesc&)>;
```
The function take a `OpDesc` of the forward operator and return one or many gradient operator descriptions.
The `GradOpDescMaker` will be registered in `OpInfo`, to replace `grad_op_type_` field. The `OpInfo` should be
```cpp
struct OpInfo {
GradOpDescMaker grad_op_maker_;
...
};
```
The `grad_op_maker_ ` is `nullptr` if the operator does not have associated gradient operators.
We should chagne register macros at the same time. In the current solution, there is no difference between forwarding operators and backward operators. So `REGISTER_OP` just register one operator. If the `REGISTER_OPERATOR ` contains `OpProtoAndCheckerMaker` and `GradOpDescMaker`, we just list them in the same macro. It can be done by a macro contains `__VA_ARGS__`.
The user interface should be
```cpp
vector<OpDesc> MinusOpGradMaker(OpDesc) {...}
REGISTER_OPERATOR(minus, MinusOp, MinusOpProtoAndCheckerMaker, SumOpGradMaker);
// Developers can still manually implement gradient operator.
REGISTER_OPERATOR(minus_grad, MinusGradOp);
```
The interface of current `REGISTER_OP` macro could not be changed. In `REGISTER_OP`, it will invoke `REGISTER_OPERATOR` two times and generate GradOpDescMaker inside.
```cpp
REGISTER_OP(minus, MinusOp, MinusOpProtoAndCheckerMaker, minus_grad, MinusGradOp);
```
# Paddle发行规范
# PaddlePaddle发行规范
Paddle使用git-flow branching model做分支管理,使用[Semantic Versioning](http://semver.org/)标准表示Paddle版本号。
PaddlePaddle使用git-flow branching model做分支管理,使用[Semantic Versioning](http://semver.org/)标准表示PaddlePaddle版本号。
Paddle每次发新的版本,遵循以下流程:
PaddlePaddle每次发新的版本,遵循以下流程:
1.`develop`分支派生出新的分支,分支名为`release/版本号`。例如,`release/0.10.0`
2. 将新分支的版本打上tag,tag为`版本号rc.Patch号`。第一个tag为`0.10.0rc1`,第二个为`0.10.0rc2`,依次类推。
......@@ -27,14 +27,14 @@ Paddle每次发新的版本,遵循以下流程:
需要注意的是:
* `release/版本号`分支一旦建立,一般不允许再从`develop`分支合入`release/版本号`。这样保证`release/版本号`分支功能的封闭,方便测试人员测试Paddle的行为。
* `release/版本号`分支一旦建立,一般不允许再从`develop`分支合入`release/版本号`。这样保证`release/版本号`分支功能的封闭,方便测试人员测试PaddlePaddle的行为。
*`release/版本号`分支存在的时候,如果有bugfix的行为,需要将bugfix的分支同时merge到`master`, `develop``release/版本号`这三个分支。
# Paddle 分支规范
# PaddlePaddle 分支规范
Paddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范,并适应github的特性做了一些区别。
PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范,并适应github的特性做了一些区别。
* Paddle的主版本库遵循[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范。其中:
* PaddlePaddle的主版本库遵循[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范。其中:
* `master`分支为稳定(stable branch)版本分支。每一个`master`分支的版本都是经过单元测试和回归测试的版本。
* `develop`分支为开发(develop branch)版本分支。每一个`develop`分支的版本都经过单元测试,但并没有经过回归测试。
* `release/版本号`分支为每一次Release时建立的临时分支。在这个阶段的代码正在经历回归测试。
......@@ -42,18 +42,18 @@ Paddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-branch
* 其他用户的fork版本库并不需要严格遵守[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范,但所有fork的版本库的所有分支都相当于特性分支。
* 建议,开发者fork的版本库使用`develop`分支同步主版本库的`develop`分支
* 建议,开发者fork的版本库中,再基于`develop`版本fork出自己的功能分支。
* 当功能分支开发完毕后,向Paddle的主版本库提交`Pull Reuqest`,进而进行代码评审。
* 当功能分支开发完毕后,向PaddlePaddle的主版本库提交`Pull Reuqest`,进而进行代码评审。
* 在评审过程中,开发者修改自己的代码,可以继续在自己的功能分支提交代码。
* BugFix分支也是在开发者自己的fork版本库维护,与功能分支不同的是,BugFix分支需要分别给主版本库的`master``develop`与可能有的`release/版本号`分支,同时提起`Pull Request`
# Paddle回归测试列表
# PaddlePaddle回归测试列表
本列表说明Paddle发版之前需要测试的功能点。
本列表说明PaddlePaddle发版之前需要测试的功能点。
## Paddle Book中所有章节
## PaddlePaddle Book中所有章节
Paddle每次发版本首先要保证Paddle Book中所有章节功能的正确性。功能的正确性包括验证Paddle目前的`paddle_trainer`训练和纯使用`Python`训练模型正确性。
PaddlePaddle每次发版本首先要保证PaddlePaddle Book中所有章节功能的正确性。功能的正确性包括验证PaddlePaddle目前的`paddle_trainer`训练和纯使用`Python`训练模型正确性。
| | 新手入门章节 | 识别数字 | 图像分类 | 词向量 | 情感分析 | 语意角色标注 | 机器翻译 | 个性化推荐 |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
......
......@@ -17,7 +17,7 @@ Scope is an association of a name to variable. All variables belong to `Scope`.
1. Scope only contains a map of a name to variable.
All parameters, data, states in a Net should be variables and stored inside a scope. Each op should get inputs and outputs to do computation from a scope, such as data buffer, state(momentum) etc.
All parameters, data, states in a Net should be variables and stored inside a scope. Each op should get inputs and outputs to do computation from a scope, such as data buffer, state (momentum) etc.
1. Variable can only be created by Scope and a variable can only be got from Scope. User cannot create or get a variable outside a scope. This is a constraints of our framework, and will keep our framework simple and clear.
......@@ -32,7 +32,7 @@ Scope is an association of a name to variable. All variables belong to `Scope`.
1. Scope should destruct all Variables inside it when itself is destructed. User can never store `Variable` pointer somewhere else.
Because Variable can only be got from Scope. When destroying Scope, we also need to destroy all the Variables in it. If user store `Variable` pointer to private data member or some global variable, the pointer will be a invalid pointer when associated `Scope` is destroyed.
Because Variable can only be got from Scope. When destroying Scope, we also need to destroy all the Variables in it. If user store `Variable` pointer to private data member or some global variable, the pointer will be an invalid pointer when associated `Scope` is destroyed.
```cpp
class Scope {
......@@ -50,7 +50,7 @@ class Scope {
Just like [scope](https://en.wikipedia.org/wiki/Scope_(computer_science)) in programming languages, `Scope` in the neural network can also be a local scope. There are two attributes about local scope.
1. We can create local variables in a local scope. When that local scope are destroyed, all local variables should also be destroyed.
1. We can create local variables in a local scope. When that local scope is destroyed, all local variables should also be destroyed.
2. Variables in a parent scope can be retrieved from local scopes of that parent scope, i.e., when user get a variable from a scope, it will try to search this variable in current scope. If there is no such variable in the local scope, `scope` will keep searching from its parent, until the variable is found or there is no parent.
```cpp
......@@ -121,4 +121,4 @@ Also, as the parent scope is a `shared_ptr`, we can only `Create()` a scope shar
## Orthogonal interface
`FindVar` will return `nullptr` when `name` is not found. It can be used as `Contains` method. `NewVar` will return a `Error` when there is a name conflict locally. Combine `FindVar` and `NewVar`, we can implement `NewVar` easily.
`FindVar` will return `nullptr` when `name` is not found. It can be used as `Contains` method. `NewVar` will return an `Error` when there is a name conflict locally. Combine `FindVar` and `NewVar`, we can implement `NewVar` easily.
......@@ -6,9 +6,9 @@ The Interaction between Python and C++ can be simplified as two steps:
1. C++ tells Python how many Ops there are, and what parameter do users need to offer to initialize a new Op. Python then builds API for each Op at compile time.
2. Users invoke APIs built by Python and provide necessary parameters. These parameters will be sent to C++ fo finish Op construction task.
2. Users invoke APIs built by Python and provide necessary parameters. These parameters will be sent to C++ for finishing the Op construction task.
### Message form C++ to Python
### Message from C++ to Python
We define a Protobuf message class `OpProto` to hold message needed in the first step. What should an `OpProto` contain? This question is equivalent to “What message do we need to offer, to build a Python API which is legal and user oriented and can use to describe a whole Op.”
......
# Design for TensorArray
TensorArray as a new concept is borrowed from TensorFlow,
it is meant to be used with dynamic iteration primitives such as `while_loop` and `map_fn`.
This concept can be used to support our new design of dynamic operations, and help to refactor some existing variant-sentence-related layers,
such as `RecurrentGradientMachine`.
In [our design for dynamic RNN](https://github.com/PaddlePaddle/Paddle/pull/4401),
`TensorArray` is used to segment inputs and store states in all time steps.
By providing some methods similar to a C++ array,
the definition of some state-based dynamic models such as RNN could be more natural and highly flexible.
## Dynamic-Related Methods
Some basic methods should be proposed as follows:
### stack()
Pack the values in a `TensorArray` into a tensor with rank one higher than each tensor in `values`.
### unstack(axis=0)
Unpacks the given dimension of a rank-`R` tensor into rank-`(R-1)` tensors.
### concat()
Return the values in the `TensorArray` as a concatenated Tensor.
### write(index, value, data_shared=true)
Write value into index of the TensorArray.
### read(index)
Read the value at location `index` in the `TensorArray`.
### size()
Return the number of values.
## LoDTensor-related Supports
The `RecurrentGradientMachine` in Paddle serves as a flexible RNN layer; it takes variant length sequences as input,
because each step of RNN could only take a tensor-represented batch of data as input,
some preprocess should be taken on the inputs such as sorting the sentences by their length in descending order and cut each word and pack to new batches.
Such cut-like operations can be embedded into `TensorArray` as general methods called `unpack` and `pack`.
With these two methods, a variant-sentence-RNN can be implemented like
```c++
// input is the varient-length data
LodTensor sentence_input(xxx);
TensorArray ta;
Tensor indice_map;
Tensor boot_state = xxx; // to initialize rnn's first state
TensorArray::unpack(input, 1/*level*/, true/*sort_by_length*/, &ta, &indice_map);
TessorArray step_outputs;
TensorArray states;
for (int step = 0; step = ta.size(); step++) {
auto state = states.read(step);
// rnnstep is a function which acts like a step of RNN
auto step_input = ta.read(step);
auto step_output = rnnstep(step_input, state);
step_outputs.write(step_output, true/*data_shared*/);
}
// rnn_output is the final output of an rnn
LoDTensor rnn_output = ta.pack(ta, indice_map);
```
the code above shows that by embedding the LoDTensor-related preprocess operations into `TensorArray`,
the implementation of a RNN that supports varient-length sentences is far more concise than `RecurrentGradientMachine` because the latter mixes all the codes together, hard to read and extend.
some details are as follows.
### unpack(level, sort_by_length)
Split LodTensor in some `level` and generate batches, if set `sort_by_length`, will sort by length.
Returns:
- a new `TensorArray`, whose values are LodTensors and represents batches of data.
- an int32 Tensor, which stores the map from the new batch's indices to original LoDTensor
### pack(level, indices_map)
Recover the original LoD-arranged LoDTensor with the values in a `TensorArray` and `level` and `indices_map`.
## Background
PaddlePaddle divides the description of neural network computation graph into two stages: compile time and runtime.
PaddlePaddle use proto message to describe compile time graph for
PaddlePaddle use proto message to describe compile time graph because
1. Computation graph should be able to be saved to a file.
1. In distributed training, the graph will be serialized and send to multiple workers.
......
###################
编译安装与单元测试
###################
.. contents::
1. 运行Docker GPU镜像出现 "CUDA driver version is insufficient"
----------------------------------------------------------------
用户在使用PaddlePaddle GPU的Docker镜像的时候,常常出现 `Cuda Error: CUDA driver version is insufficient for CUDA runtime version`, 原因在于没有把机器上CUDA相关的驱动和库映射到容器内部。
具体的解决方法是:
.. code-block:: bash
$ export CUDA_SO="$(\ls usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
$ export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
$ docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddlepaddle:latest-gpu
更多关于Docker的安装与使用, 请参考 `PaddlePaddle Docker 文档 <http://www.paddlepaddle.org/doc_cn/build_and_install/install/docker_install.html>`_ 。
2. CMake源码编译, 找到的PythonLibs和PythonInterp版本不一致
----------------------------------------------------------------
这是目前CMake寻找Python的逻辑存在缺陷,如果系统安装了多个Python版本,CMake找到的Python库和Python解释器版本可能有不一致现象,导致编译PaddlePaddle失败。正确的解决方法是,
用户强制指定特定的Python版本,具体操作如下:
.. code-block:: bash
cmake .. -DPYTHON_EXECUTABLE=<exc_path> -DPYTHON_LIBRARY=<lib_path> -DPYTHON_INCLUDE_DIR=<inc_path>
用户需要指定本机上Python的路径:``<exc_path>``, ``<lib_path>``, ``<inc_path>``
3. CMake源码编译,Paddle版本号为0.0.0
--------------------------------------
如果运行 :code:`paddle version`, 出现 :code:`PaddlePaddle 0.0.0`;或者运行 :code:`cmake ..`,出现
.. code-block:: bash
CMake Warning at cmake/version.cmake:20 (message):
Cannot add paddle version from git tag
那么用户需要拉取所有的远程分支到本机,命令为 :code:`git fetch upstream`,然后重新cmake即可。
4. paddlepaddle\*.whl is not a supported wheel on this platform.
------------------------------------------------------------------------
出现这个问题的主要原因是,没有找到和当前系统匹配的paddlepaddle安装包。最新的paddlepaddle python安装包支持Linux x86_64和MacOS 10.12操作系统,并安装了python 2.7和pip 9.0.1。
更新 :code:`pip` 包的方法是\:
.. code-block:: bash
pip install --upgrade pip
如果还不行,可以执行 :code:`python -c "import pip; print(pip.pep425tags.get_supported())"` 获取当前系统支持的python包的后缀,
并对比是否和正在安装的后缀一致。
如果系统支持的是 :code:`linux_x86_64` 而安装包是 :code:`manylinux1_x86_64` ,需要升级pip版本到最新;
如果系统支持 :code:`manylinux1_x86_64` 而安装包(本地)是 :code:`linux_x86_64` ,可以重命名这个whl包为 :code:`manylinux1_x86_64` 再安装。
5. 编译安装后执行 import paddle.v2 as paddle 报ImportError: No module named v2
------------------------------------------------------------------------------------------
先查看一下是否曾经安装过paddle v1版本,有的话需要先卸载:
pip uninstall py_paddle paddle
然后安装paddle的python环境, 在build目录下执行
pip install python/dist/paddle*.whl && pip install ../paddle/dist/py_paddle*.whl
6. 遇到“非法指令”或者是“illegal instruction”
--------------------------------------------
PaddlePaddle使用avx SIMD指令提高cpu执行效率,因此错误的使用二进制发行版可能会导致这种错误,请选择正确的版本。
7. python相关的单元测试都过不了
--------------------------------
如果出现以下python相关的单元测试都过不了的情况:
.. code-block:: bash
24 - test_PyDataProvider (Failed)
26 - test_RecurrentGradientMachine (Failed)
27 - test_NetworkCompare (Failed)
28 - test_PyDataProvider2 (Failed)
32 - test_Prediction (Failed)
33 - test_Compare (Failed)
34 - test_Trainer (Failed)
35 - test_TrainerOnePass (Failed)
36 - test_CompareTwoNets (Failed)
37 - test_CompareTwoOpts (Failed)
38 - test_CompareSparse (Failed)
39 - test_recurrent_machine_generation (Failed)
40 - test_PyDataProviderWrapper (Failed)
41 - test_config_parser (Failed)
42 - test_swig_api (Failed)
43 - layers_test (Failed)
并且查询PaddlePaddle单元测试的日志,提示:
.. code-block:: bash
paddle package is already in your PYTHONPATH. But unittest need a clean environment.
Please uninstall paddle package before start unittest. Try to 'pip uninstall paddle'.
解决办法是:
* 卸载PaddlePaddle包 :code:`pip uninstall paddle`, 清理掉老旧的PaddlePaddle安装包,使得单元测试有一个干净的环境。如果PaddlePaddle包已经在python的site-packages里面,单元测试会引用site-packages里面的python包,而不是源码目录里 :code:`/python` 目录下的python包。同时,即便设置 :code:`PYTHONPATH` 到 :code:`/python` 也没用,因为python的搜索路径是优先已经安装的python包。
###############
集群训练与预测
###############
.. contents::
1. 集群多节点训练,日志中保存均为网络通信类错误
------------------------------------------------
集群多节点训练,日志报错为网络通信类错误,比如 :code:`Connection reset by peer` 等。
此类报错通常是由于某一个节点的错误导致这个节点的训练进程退出,从而引发其他节点无法连接导致,可以参考下面的步骤排查:
* 从 :code:`train.log` , :code:`server.log` 找到最早报错的地方,查看是否是其他错误引发的报错(比如FPE,内存不足,磁盘空间不足等)。
* 如果发现最早的报错就是网络通信的问题,很有可能是非独占方式执行导致的端口冲突,可以联系OP,看当前MPI集群是否支持resource=full参数提交,如果支持增加此参数提交,并更换job 端口。
* 如果当前MPI集群并不支持任务独占模式,可以联系OP是否可以更换集群或升级当前集群。
####################
FAQ
####################
====
.. contents::
.. toctree::
:maxdepth: 1
1. 如何减少内存占用
---------------------------------
神经网络的训练本身是一个非常消耗内存和显存的工作,经常会消耗数10GB的内存和数GB的显存。
PaddlePaddle的内存占用主要分为如下几个方面\:
* DataProvider缓冲池内存(只针对内存)
* 神经元激活内存(针对内存和显存)
* 参数内存 (针对内存和显存)
* 其他内存杂项
其中,其他内存杂项是指PaddlePaddle本身所用的一些内存,包括字符串分配,临时变量等等,暂不考虑在内。
减少DataProvider缓冲池内存
++++++++++++++++++++++++++
PyDataProvider使用的是异步加载,同时在内存里直接随即选取数据来做Shuffle。即
.. graphviz::
digraph {
rankdir=LR;
数据文件 -> 内存池 -> PaddlePaddle训练
}
所以,减小这个内存池即可减小内存占用,同时也可以加速开始训练前数据载入的过程。但是,这
个内存池实际上决定了shuffle的粒度。所以,如果将这个内存池减小,又要保证数据是随机的,
那么最好将数据文件在每次读取之前做一次shuffle。可能的代码为
.. literalinclude:: src/reduce_min_pool_size.py
这样做可以极大的减少内存占用,并且可能会加速训练过程,详细文档参考 :ref:`api_pydataprovider2` 。
神经元激活内存
++++++++++++++
神经网络在训练的时候,会对每一个激活暂存一些数据,如神经元激活值等。
在反向传递的时候,这些数据会被用来更新参数。这些数据使用的内存主要和两个参数有关系,
一是batch size,另一个是每条序列(Sequence)长度。所以,其实也是和每个mini-batch中包含
的时间步信息成正比。
所以做法可以有两种:
* 减小batch size。 即在网络配置中 :code:`settings(batch_size=1000)` 设置成一个小一些的值。但是batch size本身是神经网络的超参数,减小batch size可能会对训练结果产生影响。
* 减小序列的长度,或者直接扔掉非常长的序列。比如,一个数据集大部分序列长度是100-200,
但是突然有一个10000长的序列,就很容易导致内存超限,特别是在LSTM等RNN中。
参数内存
++++++++
PaddlePaddle支持非常多的优化算法(Optimizer),不同的优化算法需要使用不同大小的内存。
例如使用 :code:`adadelta` 算法,则需要使用等于权重参数规模大约5倍的内存。举例,如果参数保存下来的模型目录
文件为 :code:`100M`, 那么该优化算法至少需要 :code:`500M` 的内存。
可以考虑使用一些优化算法,例如 :code:`momentum`。
2. 如何加速PaddlePaddle的训练速度
---------------------------------
加速PaddlePaddle训练可以考虑从以下几个方面\:
* 减少数据载入的耗时
* 加速训练速度
* 利用分布式训练驾驭更多的计算资源
减少数据载入的耗时
++++++++++++++++++
使用\ :code:`pydataprovider`\ 时,可以减少缓存池的大小,同时设置内存缓存功能,即可以极大的加速数据载入流程。
:code:`DataProvider` 缓存池的减小,和之前减小通过减小缓存池来减小内存占用的原理一致。
.. literalinclude:: src/reduce_min_pool_size.py
同时 :code:`@provider` 接口有一个 :code:`cache` 参数来控制缓存方法,将其设置成 :code:`CacheType.CACHE_PASS_IN_MEM` 的话,会将第一个 :code:`pass` (过完所有训练数据即为一个pass)生成的数据缓存在内存里,在之后的 :code:`pass` 中,不会再从 :code:`python` 端读取数据,而是直接从内存的缓存里读取数据。这也会极大减少数据读入的耗时。
加速训练速度
++++++++++++
PaddlePaddle支持Sparse的训练,sparse训练需要训练特征是 :code:`sparse_binary_vector` 、 :code:`sparse_vector` 、或者 :code:`integer_value` 的任一一种。同时,与这个训练数据交互的Layer,需要将其Parameter设置成 sparse 更新模式,即设置 :code:`sparse_update=True`
这里使用简单的 :code:`word2vec` 训练语言模型距离,具体使用方法为\:
使用一个词前两个词和后两个词,来预测这个中间的词。这个任务的DataProvider为\:
.. literalinclude:: src/word2vec_dataprovider.py
这个任务的配置为\:
.. literalinclude:: src/word2vec_config.py
利用更多的计算资源
++++++++++++++++++
利用更多的计算资源可以分为一下几个方式来进行\:
* 单机CPU训练
* 使用多线程训练。设置命令行参数 :code:`trainer_count`。
* 单机GPU训练
* 使用显卡训练。设置命令行参数 :code:`use_gpu`。
* 使用多块显卡训练。设置命令行参数 :code:`use_gpu` 和 :code:`trainer_count` 。
* 多机训练
* 请参考 :ref:`cluster_train` 。
3. 遇到“非法指令”或者是“illegal instruction”
--------------------------------------------
PaddlePaddle使用avx SIMD指令提高cpu执行效率,因此错误的使用二进制发行版可能会导致这种错误,请选择正确的版本。
4. 如何选择SGD算法的学习率
--------------------------
在采用sgd/async_sgd进行训练时,一个重要的问题是选择正确的learning_rate。如果learning_rate太大,那么训练有可能不收敛,如果learning_rate太小,那么收敛可能很慢,导致训练时间过长。
通常做法是从一个比较大的learning_rate开始试,如果不收敛,那减少学习率10倍继续试验,直到训练收敛为止。那么如何判断训练不收敛呢?可以估计出如果模型采用不变的输出最小的cost0是多少。
如果训练过程的的cost明显高于这个常数输出的cost,那么我们可以判断为训练不收敛。举一个例子,假如我们是三分类问题,采用multi-class-cross-entropy作为cost,数据中0,1,2三类的比例为 :code:`0.2, 0.5, 0.3` , 那么常数输出所能达到的最小cost是 :code:`-(0.2*log(0.2)+0.5*log(0.5)+0.3*log(0.3))=1.03` 。如果训练一个pass(或者更早)后,cost还大于这个数,那么可以认为训练不收敛,应该降低学习率。
5. 如何初始化参数
-----------------
默认情况下,PaddlePaddle使用均值0,标准差为 :math:`\frac{1}{\sqrt{d}}` 来初始化参数。其中 :math:`d` 为参数矩阵的宽度。这种初始化方式在一般情况下不会产生很差的结果。如果用户想要自定义初始化方式,PaddlePaddle目前提供两种参数初始化的方式\:
* 高斯分布。将 :code:`param_attr` 设置成 :code:`param_attr=ParamAttr(initial_mean=0.0, initial_std=1.0)`
* 均匀分布。将 :code:`param_attr` 设置成 :code:`param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0)`
比如设置一个全连接层的参数初始化方式和bias初始化方式,可以使用如下代码。
.. code-block:: python
hidden = fc_layer(input=ipt, param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0),
bias_attr=ParamAttr(initial_mean=1.0, initial_std=0.0))
上述代码将bias全部初始化为1.0, 同时将参数初始化为 :code:`[1.0, -1.0]` 的均匀分布。
6. 如何共享参数
---------------
PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID,相同名字的参数,会共享参数。设置参数的名字,可以使用 :code:`ParamAttr(name="YOUR_PARAM_NAME")` 来设置。更方便的设置方式,是使得要共享的参数使用同样的 :code:`ParamAttr` 对象。
简单的全连接网络,参数共享的配置示例为\:
.. literalinclude:: ../../python/paddle/trainer_config_helpers/tests/configs/shared_fc.py
这里 :code:`hidden_a` 和 :code:`hidden_b` 使用了同样的parameter和bias。并且softmax层的两个输入也使用了同样的参数 :code:`softmax_param`。
7. \*-cp27mu-linux_x86_64.whl is not a supported wheel on this platform.
------------------------------------------------------------------------
出现这个问题的主要原因是,系统编译wheel包的时候,使用的 :code:`wheel` 包是最新的,
而系统中的 :code:`pip` 包比较老。具体的解决方法是,更新 :code:`pip` 包并重新编译PaddlePaddle。
更新 :code:`pip` 包的方法是\:
.. code-block:: bash
pip install --upgrade pip
8. python相关的单元测试都过不了
--------------------------------
如果出现以下python相关的单元测试都过不了的情况:
.. code-block:: bash
24 - test_PyDataProvider (Failed)
26 - test_RecurrentGradientMachine (Failed)
27 - test_NetworkCompare (Failed)
28 - test_PyDataProvider2 (Failed)
32 - test_Prediction (Failed)
33 - test_Compare (Failed)
34 - test_Trainer (Failed)
35 - test_TrainerOnePass (Failed)
36 - test_CompareTwoNets (Failed)
37 - test_CompareTwoOpts (Failed)
38 - test_CompareSparse (Failed)
39 - test_recurrent_machine_generation (Failed)
40 - test_PyDataProviderWrapper (Failed)
41 - test_config_parser (Failed)
42 - test_swig_api (Failed)
43 - layers_test (Failed)
并且查询PaddlePaddle单元测试的日志,提示:
.. code-block:: bash
paddle package is already in your PYTHONPATH. But unittest need a clean environment.
Please uninstall paddle package before start unittest. Try to 'pip uninstall paddle'.
解决办法是:
* 卸载PaddlePaddle包 :code:`pip uninstall paddle`, 清理掉老旧的PaddlePaddle安装包,使得单元测试有一个干净的环境。如果PaddlePaddle包已经在python的site-packages里面,单元测试会引用site-packages里面的python包,而不是源码目录里 :code:`/python` 目录下的python包。同时,即便设置 :code:`PYTHONPATH` 到 :code:`/python` 也没用,因为python的搜索路径是优先已经安装的python包。
9. 运行Docker GPU镜像出现 "CUDA driver version is insufficient"
----------------------------------------------------------------
用户在使用PaddlePaddle GPU的Docker镜像的时候,常常出现 `Cuda Error: CUDA driver version is insufficient for CUDA runtime version`, 原因在于没有把机器上CUDA相关的驱动和库映射到容器内部。
具体的解决方法是:
.. code-block:: bash
$ export CUDA_SO="$(\ls usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
$ export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
$ docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddlepaddle:latest-gpu
更多关于Docker的安装与使用, 请参考 `PaddlePaddle Docker 文档 <http://www.paddlepaddle.org/doc_cn/build_and_install/install/docker_install.html>`_ 。
10. CMake源码编译, 找到的PythonLibs和PythonInterp版本不一致
----------------------------------------------------------------
这是目前CMake寻找Python的逻辑存在缺陷,如果系统安装了多个Python版本,CMake找到的Python库和Python解释器版本可能有不一致现象,导致编译PaddlePaddle失败。正确的解决方法是,
用户强制指定特定的Python版本,具体操作如下:
.. code-block:: bash
cmake .. -DPYTHON_EXECUTABLE=<exc_path> -DPYTHON_LIBRARY=<lib_path> -DPYTHON_INCLUDE_DIR=<inc_path>
用户需要指定本机上Python的路径:``<exc_path>``, ``<lib_path>``, ``<inc_path>``
11. CMake源码编译,Paddle版本号为0.0.0
--------------------------------------
如果运行 :code:`paddle version`, 出现 :code:`PaddlePaddle 0.0.0`;或者运行 :code:`cmake ..`,出现
.. code-block:: bash
CMake Warning at cmake/version.cmake:20 (message):
Cannot add paddle version from git tag
那么用户需要拉取所有的远程分支到本机,命令为 :code:`git fetch upstream`,然后重新cmake即可。
12. A protocol message was rejected because it was too big
----------------------------------------------------------
如果在训练NLP相关模型时,出现以下错误:
.. code-block:: bash
[libprotobuf ERROR google/protobuf/io/coded_stream.cc:171] A protocol message was rejected because it was too big (more than 67108864 bytes). To increase the limit (or to disable these warnings), see CodedInputStream::SetTotalBytesLimit() in google/protobuf/io/coded_stream.h.
F1205 14:59:50.295174 14703 TrainerConfigHelper.cpp:59] Check failed: m->conf.ParseFromString(configProtoStr)
可能的原因是:传给dataprovider的某一个args过大,一般是由于直接传递大字典导致的。错误的define_py_data_sources2类似:
.. code-block:: python
src_dict = dict()
for line_count, line in enumerate(open(src_dict_path, "r")):
src_dict[line.strip()] = line_count
define_py_data_sources2(
train_list,
test_list,
module="dataprovider",
obj="process",
args={"src_dict": src_dict})
解决方案是:将字典的地址作为args传给dataprovider,然后在dataprovider里面根据该地址加载字典。即define_py_data_sources2应改为:
.. code-block:: python
define_py_data_sources2(
train_list,
test_list,
module="dataprovider",
obj="process",
args={"src_dict_path": src_dict_path})
完整源码可参考 `seqToseq <https://github.com/PaddlePaddle/Paddle/tree/develop/demo/seqToseq>`_ 示例。
13. 如何指定GPU设备
-------------------
例如机器上有4块GPU,编号从0开始,指定使用2、3号GPU:
* 方式1:通过 `CUDA_VISIBLE_DEVICES <http://www.acceleware.com/blog/cudavisibledevices-masking-gpus>`_ 环境变量来指定特定的GPU。
.. code-block:: bash
env CUDA_VISIBLE_DEVICES=2,3 paddle train --use_gpu=true --trainer_count=2
* 方式2:通过命令行参数 ``--gpu_id`` 指定。
.. code-block:: bash
paddle train --use_gpu=true --trainer_count=2 --gpu_id=2
14. 训练过程中出现 :code:`Floating point exception`, 训练因此退出怎么办?
------------------------------------------------------------------------
Paddle二进制在运行时捕获了浮点数异常,只要出现浮点数异常(即训练过程中出现NaN或者Inf),立刻退出。浮点异常通常的原因是浮点数溢出、除零等问题。
主要原因包括两个方面:
* 训练过程中参数或者训练过程中的梯度尺度过大,导致参数累加,乘除等时候,导致了浮点数溢出。
* 模型一直不收敛,发散到了一个数值特别大的地方。
* 训练数据有问题,导致参数收敛到了一些奇异的情况。或者输入数据尺度过大,有些特征的取值达到数百万,这时进行矩阵乘法运算就可能导致浮点数溢出。
主要的解决办法是减小学习律或者对数据进行归一化处理。
15. 编译安装后执行 import paddle.v2 as paddle 报ImportError: No module named v2
------------------------------------------------------------------------
先查看一下是否曾经安装过paddle v1版本,有的话需要先卸载:
pip uninstall py_paddle paddle
然后安装paddle的python环境, 在build目录下执行
pip install python/dist/paddle*.whl && pip install ../paddle/dist/py_paddle*.whl
build_and_install/index_cn.rst
model/index_cn.rst
parameter/index_cn.rst
local/index_cn.rst
cluster/index_cn.rst
###############
本地训练与预测
###############
.. contents::
1. 如何减少内存占用
-------------------
神经网络的训练本身是一个非常消耗内存和显存的工作,经常会消耗数10GB的内存和数GB的显存。
PaddlePaddle的内存占用主要分为如下几个方面\:
* DataProvider缓冲池内存(只针对内存)
* 神经元激活内存(针对内存和显存)
* 参数内存 (针对内存和显存)
* 其他内存杂项
其中,其他内存杂项是指PaddlePaddle本身所用的一些内存,包括字符串分配,临时变量等等,暂不考虑在内。
减少DataProvider缓冲池内存
++++++++++++++++++++++++++
PyDataProvider使用的是异步加载,同时在内存里直接随即选取数据来做Shuffle。即
.. graphviz::
digraph {
rankdir=LR;
数据文件 -> 内存池 -> PaddlePaddle训练
}
所以,减小这个内存池即可减小内存占用,同时也可以加速开始训练前数据载入的过程。但是,这
个内存池实际上决定了shuffle的粒度。所以,如果将这个内存池减小,又要保证数据是随机的,
那么最好将数据文件在每次读取之前做一次shuffle。可能的代码为
.. literalinclude:: src/reduce_min_pool_size.py
这样做可以极大的减少内存占用,并且可能会加速训练过程,详细文档参考 :ref:`api_pydataprovider2` 。
神经元激活内存
++++++++++++++
神经网络在训练的时候,会对每一个激活暂存一些数据,如神经元激活值等。
在反向传递的时候,这些数据会被用来更新参数。这些数据使用的内存主要和两个参数有关系,
一是batch size,另一个是每条序列(Sequence)长度。所以,其实也是和每个mini-batch中包含
的时间步信息成正比。
所以做法可以有两种:
* 减小batch size。 即在网络配置中 :code:`settings(batch_size=1000)` 设置成一个小一些的值。但是batch size本身是神经网络的超参数,减小batch size可能会对训练结果产生影响。
* 减小序列的长度,或者直接扔掉非常长的序列。比如,一个数据集大部分序列长度是100-200,
但是突然有一个10000长的序列,就很容易导致内存超限,特别是在LSTM等RNN中。
参数内存
++++++++
PaddlePaddle支持非常多的优化算法(Optimizer),不同的优化算法需要使用不同大小的内存。
例如使用 :code:`adadelta` 算法,则需要使用等于权重参数规模大约5倍的内存。举例,如果参数保存下来的模型目录
文件为 :code:`100M`, 那么该优化算法至少需要 :code:`500M` 的内存。
可以考虑使用一些优化算法,例如 :code:`momentum`。
2. 如何加速训练速度
-------------------
加速PaddlePaddle训练可以考虑从以下几个方面\:
* 减少数据载入的耗时
* 加速训练速度
* 利用分布式训练驾驭更多的计算资源
减少数据载入的耗时
++++++++++++++++++
使用\ :code:`pydataprovider`\ 时,可以减少缓存池的大小,同时设置内存缓存功能,即可以极大的加速数据载入流程。
:code:`DataProvider` 缓存池的减小,和之前减小通过减小缓存池来减小内存占用的原理一致。
.. literalinclude:: src/reduce_min_pool_size.py
同时 :code:`@provider` 接口有一个 :code:`cache` 参数来控制缓存方法,将其设置成 :code:`CacheType.CACHE_PASS_IN_MEM` 的话,会将第一个 :code:`pass` (过完所有训练数据即为一个pass)生成的数据缓存在内存里,在之后的 :code:`pass` 中,不会再从 :code:`python` 端读取数据,而是直接从内存的缓存里读取数据。这也会极大减少数据读入的耗时。
加速训练速度
++++++++++++
PaddlePaddle支持Sparse的训练,sparse训练需要训练特征是 :code:`sparse_binary_vector` 、 :code:`sparse_vector` 、或者 :code:`integer_value` 的任一一种。同时,与这个训练数据交互的Layer,需要将其Parameter设置成 sparse 更新模式,即设置 :code:`sparse_update=True`
这里使用简单的 :code:`word2vec` 训练语言模型距离,具体使用方法为\:
使用一个词前两个词和后两个词,来预测这个中间的词。这个任务的DataProvider为\:
.. literalinclude:: src/word2vec_dataprovider.py
这个任务的配置为\:
.. literalinclude:: src/word2vec_config.py
利用更多的计算资源
++++++++++++++++++
利用更多的计算资源可以分为一下几个方式来进行\:
* 单机CPU训练
* 使用多线程训练。设置命令行参数 :code:`trainer_count`。
* 单机GPU训练
* 使用显卡训练。设置命令行参数 :code:`use_gpu`。
* 使用多块显卡训练。设置命令行参数 :code:`use_gpu` 和 :code:`trainer_count` 。
* 多机训练
* 请参考 :ref:`cluster_train` 。
3. 如何指定GPU设备
------------------
例如机器上有4块GPU,编号从0开始,指定使用2、3号GPU:
* 方式1:通过 `CUDA_VISIBLE_DEVICES <http://www.acceleware.com/blog/cudavisibledevices-masking-gpus>`_ 环境变量来指定特定的GPU。
.. code-block:: bash
env CUDA_VISIBLE_DEVICES=2,3 paddle train --use_gpu=true --trainer_count=2
* 方式2:通过命令行参数 ``--gpu_id`` 指定。
.. code-block:: bash
paddle train --use_gpu=true --trainer_count=2 --gpu_id=2
4. 训练过程中出现 :code:`Floating point exception`, 训练因此退出怎么办?
------------------------------------------------------------------------
Paddle二进制在运行时捕获了浮点数异常,只要出现浮点数异常(即训练过程中出现NaN或者Inf),立刻退出。浮点异常通常的原因是浮点数溢出、除零等问题。
主要原因包括两个方面:
* 训练过程中参数或者训练过程中的梯度尺度过大,导致参数累加,乘除等时候,导致了浮点数溢出。
* 模型一直不收敛,发散到了一个数值特别大的地方。
* 训练数据有问题,导致参数收敛到了一些奇异的情况。或者输入数据尺度过大,有些特征的取值达到数百万,这时进行矩阵乘法运算就可能导致浮点数溢出。
这里有两种有效的解决方法:
1. 设置 :code:`gradient_clipping_threshold` 参数,示例代码如下:
.. code-block:: python
optimizer = paddle.optimizer.RMSProp(
learning_rate=1e-3,
gradient_clipping_threshold=10.0,
regularization=paddle.optimizer.L2Regularization(rate=8e-4))
具体可以参考 `nmt_without_attention <https://github.com/PaddlePaddle/models/blob/develop/nmt_without_attention/train.py#L35>`_ 示例。
2. 设置 :code:`error_clipping_threshold` 参数,示例代码如下:
.. code-block:: python
decoder_inputs = paddle.layer.fc(
act=paddle.activation.Linear(),
size=decoder_size * 3,
bias_attr=False,
input=[context, current_word],
layer_attr=paddle.attr.ExtraLayerAttribute(
error_clipping_threshold=100.0))
完整代码可以参考示例 `machine translation <https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/train.py#L66>`_ 。
两种方法的区别:
1. 两者都是对梯度的截断,但截断时机不同,前者在 :code:`optimzier` 更新网络参数时应用;后者在激活函数反向计算时被调用;
2. 截断对象不同:前者截断可学习参数的梯度,后者截断回传给前层的梯度;
除此之外,还可以通过减小学习律或者对数据进行归一化处理来解决这类问题。
5. 如何调用 infer 接口输出多个layer的预测结果
-----------------------------------------------
* 将需要输出的层作为 :code:`paddle.inference.Inference()` 接口的 :code:`output_layer` 参数输入,代码如下:
.. code-block:: python
inferer = paddle.inference.Inference(output_layer=[layer1, layer2], parameters=parameters)
* 指定要输出的字段进行输出。以输出 :code:`value` 字段为例,代码如下:
.. code-block:: python
out = inferer.infer(input=data_batch, field=["value"])
需要注意的是:
* 如果指定了2个layer作为输出层,实际上需要的输出结果是两个矩阵;
* 假设第一个layer的输出A是一个 N1 * M1 的矩阵,第二个 Layer 的输出B是一个 N2 * M2 的矩阵;
* paddle.v2 默认会将A和B 横向拼接,当N1 和 N2 大小不一样时,会报如下的错误:
.. code-block:: python
ValueError: all the input array dimensions except for the concatenation axis must match exactly
多个层的输出矩阵的高度不一致导致拼接失败,这种情况常常发生在:
* 同时输出序列层和非序列层;
* 多个输出层处理多个不同长度的序列;
此时可以在调用infer接口时通过设置 :code:`flatten_result=False` , 跳过“拼接”步骤,来解决上面的问题。这时,infer接口的返回值是一个python list:
* list 中元素的个数等于网络中输出层的个数;
* list 中每个元素是一个layer的输出结果矩阵,类型是numpy的ndarray;
* 每一个layer输出矩阵的高度,在非序列输入时:等于样本数;序列输入时等于:输入序列中元素的总数;宽度等于配置中layer的size;
#########
模型配置
#########
.. contents::
1. 出现 :code:`Duplicated layer name` 错误怎么办
--------------------------------------------------
出现该错误的原因一般是用户对不同layer的参数 :code:`name` 设置了相同的取值。遇到该错误时,先找出参数 :code:`name` 取值相同的layer,然后将这些layer的参数 :code:`name` 设置为不同的值。
2. :code:`paddle.layer.memory` 的参数 :code:`name` 如何使用
-------------------------------------------------------------
* :code:`paddle.layer.memory` 用于获取特定layer上一时间步的输出,该layer是通过参数 :code:`name` 指定,即,:code:`paddle.layer.memory` 会关联参数 :code:`name` 取值相同的layer,并将该layer上一时间步的输出作为自身当前时间步的输出。
* PaddlePaddle的所有layer都有唯一的name,用户通过参数 :code:`name` 设定,当用户没有显式设定时,PaddlePaddle会自动设定。而 :code:`paddle.layer.memory` 不是真正的layer,其name由参数 :code:`memory_name` 设定,当用户没有显式设定时,PaddlePaddle会自动设定。:code:`paddle.layer.memory` 的参数 :code:`name` 用于指定其要关联的layer,需要用户显式设定。
3. 两种使用 drop_out 的方法有何区别
------------------------------------
* 在PaddlePaddle中使用dropout有两种方式
* 在相应layer的 :code:`layer_atter` 设置 :code:`drop_rate`,以 :code:`paddle.layer.fc` 为例,代码如下:
.. code-block:: python
fc = paddle.layer.fc(input=input, layer_attr=paddle.attr.ExtraLayerAttribute(drop_rate=0.5))
* 使用 :code:`paddle.layer.dropout`,以 :code:`paddle.layer.fc` 为例,代码如下:
.. code-block:: python
fc = paddle.layer.fc(input=input)
drop_fc = paddle.layer.dropout(input=fc, dropout_rate=0.5)
* :code:`paddle.layer.dropout` 实际上使用了 :code:`paddle.layer.add_to`,并在该layer里采用第一种方式设置 :code:`drop_rate` 来使用dropout的。这种方式对内存消耗较大。
* PaddlePaddle在激活函数里实现dropout,而不是在layer里实现。
* :code:`paddle.layer.lstmemory`、:code:`paddle.layer.grumemory`、:code:`paddle.layer.recurrent` 不是通过一般的方式来实现对输出的激活,所以不能采用第一种方式在这几个layer里设置 :code:`drop_rate` 来使用dropout。若要对这几个layer使用dropout,可采用第二种方式,即使用 :code:`paddle.layer.dropout`。
4. 不同的 recurrent layer 的区别
----------------------------------
以LSTM为例,在PaddlePaddle中包含以下 recurrent layer:
* :code:`paddle.layer.lstmemory`
* :code:`paddle.networks.simple_lstm`
* :code:`paddle.networks.lstmemory_group`
* :code:`paddle.networks.bidirectional_lstm`
按照具体实现方式可以归纳为2类:
1. 由 recurrent_group 实现的 recurrent layer:
* 用户在使用这一类recurrent layer时,可以访问由recurrent unit在一个时间步内计算得到的中间值(例如:hidden states, memory cells等);
* 上述的 :code:`paddle.networks.lstmemory_group` 是这一类的 recurrent layer ;
2. 将recurrent layer作为一个整体来实现:
* 用户在使用这一类recurrent layer,只能访问它们的输出值;
* 上述的 :code:`paddle.networks.lstmemory_group` 、 :code:`paddle.networks.simple_lstm` 和 :code:`paddle.networks.bidirectional_lstm` 属于这一类的实现;
将recurrent layer作为一个整体来实现, 能够针对CPU和GPU的计算做更多优化, 所以相比于recurrent group的实现方式, 第二类 recurrent layer 计算效率更高。 在实际应用中,如果用户不需要访问LSTM的中间变量,而只需要获得recurrent layer计算的输出,我们建议使用第二类实现。
此外,关于LSTM, PaddlePaddle中还包含 :code:`paddle.networks.lstmemory_unit` 这一计算单元:
* 不同于上述介绍的recurrent layer , :code:`paddle.networks.lstmemory_unit` 定义了LSTM单元在一个时间步内的计算过程,它并不是一个完整的recurrent layer,也不能接收序列数据作为输入;
* :code:`paddle.networks.lstmemory_unit` 只能在recurrent_group中作为step function使用;
#########
参数设置
#########
.. contents::
1. 如何选择SGD算法的学习率
--------------------------
在采用sgd/async_sgd进行训练时,一个重要的问题是选择正确的learning_rate。如果learning_rate太大,那么训练有可能不收敛,如果learning_rate太小,那么收敛可能很慢,导致训练时间过长。
通常做法是从一个比较大的learning_rate开始试,如果不收敛,那减少学习率10倍继续试验,直到训练收敛为止。那么如何判断训练不收敛呢?可以估计出如果模型采用不变的输出最小的cost0是多少。
如果训练过程的的cost明显高于这个常数输出的cost,那么我们可以判断为训练不收敛。举一个例子,假如我们是三分类问题,采用multi-class-cross-entropy作为cost,数据中0,1,2三类的比例为 :code:`0.2, 0.5, 0.3` , 那么常数输出所能达到的最小cost是 :code:`-(0.2*log(0.2)+0.5*log(0.5)+0.3*log(0.3))=1.03` 。如果训练一个pass(或者更早)后,cost还大于这个数,那么可以认为训练不收敛,应该降低学习率。
2. 如何设置学习率退火(learning rate annealing)
------------------------------------------------
在相应的优化算法里设置learning_rate_schedule及相关参数,以使用Adam算法为例,代码如下:
.. code-block:: python
optimizer = paddle.optimizer.Adam(
learning_rate=1e-3,
learning_rate_decay_a=0.5,
learning_rate_decay_b=0.75,
learning_rate_schedule="poly",)
PaddlePaddle目前支持8种learning_rate_schedule,这8种learning_rate_schedule及其对应学习率计算方式如下:
* "constant"
lr = learning_rate
* "poly"
lr = learning_rate * pow(1 + learning_rate_decay_a * num_samples_processed, -learning_rate_decay_b)
其中,num_samples_processed为已训练样本数,下同。
* "caffe_poly"
lr = learning_rate * pow(1.0 - num_samples_processed / learning_rate_decay_a, learning_rate_decay_b)
* "exp"
lr = learning_rate * pow(learning_rate_decay_a, num_samples_processed / learning_rate_decay_b)
* "discexp"
lr = learning_rate * pow(learning_rate_decay_a, floor(num_samples_processed / learning_rate_decay_b))
* "linear"
lr = max(learning_rate - learning_rate_decay_a * num_samples_processed, learning_rate_decay_b)
* "manual"
这是一种按已训练样本数分段取值的学习率退火方法。使用该learning_rate_schedule时,用户通过参数 :code:`learning_rate_args` 设置学习率衰减因子分段函数,当前的学习率为所设置 :code:`learning_rate` 与当前的衰减因子的乘积。以使用Adam算法为例,代码如下:
.. code-block:: python
optimizer = paddle.optimizer.Adam(
learning_rate=1e-3,
learning_rate_schedule="manual",
learning_rate_args="1000:1.0,2000:0.9,3000:0.8",)
在该示例中,当已训练样本数小于等于1000时,学习率为 :code:`1e-3 * 1.0`;当已训练样本数大于1000小于等于2000时,学习率为 :code:`1e-3 * 0.9`;当已训练样本数大于2000时,学习率为 :code:`1e-3 * 0.8`。
* "pass_manual"
这是一种按已训练pass数分段取值的学习率退火方法。使用该learning_rate_schedule时,用户通过参数 :code:`learning_rate_args` 设置学习率衰减因子分段函数,当前的学习率为所设置 :code:`learning_rate` 与当前的衰减因子的乘积。以使用Adam算法为例,代码如下:
.. code-block:: python
optimizer = paddle.optimizer.Adam(
learning_rate=1e-3,
learning_rate_schedule="manual",
learning_rate_args="1:1.0,2:0.9,3:0.8",)
在该示例中,当已训练pass数小于等于1时,学习率为 :code:`1e-3 * 1.0`;当已训练pass数大于1小于等于2时,学习率为 :code:`1e-3 * 0.9`;当已训练pass数大于2时,学习率为 :code:`1e-3 * 0.8`。
3. 如何初始化参数
-----------------
默认情况下,PaddlePaddle使用均值0,标准差为 :math:`\frac{1}{\sqrt{d}}` 来初始化参数。其中 :math:`d` 为参数矩阵的宽度。这种初始化方式在一般情况下不会产生很差的结果。如果用户想要自定义初始化方式,PaddlePaddle目前提供两种参数初始化的方式\:
* 高斯分布。将 :code:`param_attr` 设置成 :code:`param_attr=ParamAttr(initial_mean=0.0, initial_std=1.0)`
* 均匀分布。将 :code:`param_attr` 设置成 :code:`param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0)`
比如设置一个全连接层的参数初始化方式和bias初始化方式,可以使用如下代码。
.. code-block:: python
hidden = fc_layer(input=ipt, param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0),
bias_attr=ParamAttr(initial_mean=1.0, initial_std=0.0))
上述代码将bias全部初始化为1.0, 同时将参数初始化为 :code:`[1.0, -1.0]` 的均匀分布。
4. 如何共享参数
---------------
PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID,相同名字的参数,会共享参数。设置参数的名字,可以使用 :code:`ParamAttr(name="YOUR_PARAM_NAME")` 来设置。更方便的设置方式,是使得要共享的参数使用同样的 :code:`ParamAttr` 对象。
简单的全连接网络,参数共享的配置示例为\:
.. literalinclude:: ../../python/paddle/trainer_config_helpers/tests/configs/shared_fc.py
这里 :code:`hidden_a` 和 :code:`hidden_b` 使用了同样的parameter和bias。并且softmax层的两个输入也使用了同样的参数 :code:`softmax_param`。
5. 如何加载预训练参数
------------------------
* 对加载预训练参数的层,设置其参数属性 :code:`is_static=True`,使该层的参数在训练过程中保持不变。以embedding层为例,代码如下:
.. code-block:: python
emb_para = paddle.attr.Param(name='emb', is_static=True)
paddle.layer.embedding(size=word_dim, input=x, param_attr=emb_para)
* 从模型文件将预训练参数载入 :code:`numpy.array`,在创建parameters后,使用 :code:`parameters.set()` 加载预训练参数。PaddlePaddle保存的模型参数文件前16字节为头信息,用户将参数载入 :code:`numpy.array` 时须从第17字节开始。以embedding层为例,代码如下:
.. code-block:: python
def load_parameter(file_name, h, w):
with open(file_name, 'rb') as f:
f.read(16) # skip header.
return np.fromfile(f, dtype=np.float32).reshape(h, w)
parameters = paddle.parameters.create(my_cost)
parameters.set('emb', load_parameter(emb_param_file, 30000, 256))
6. 存储的参数格式是什么,如何和明文进行相互转化
--------------------------------------------------
PaddlePaddle保存的模型参数文件内容由16字节头信息和网络参数两部分组成。头信息中,1~4字节表示PaddlePaddle版本信息,请直接填充0;5~8字节表示每个参数占用的字节数,当保存的网络参数为float类型时为4,double类型时为8;9~16字节表示保存的参数总个数。
将PaddlePaddle保存的模型参数还原回明文时,可以使用相应数据类型的 :code:`numpy.array` 加载具体网络参数,此时可以跳过PaddlePaddle模型参数文件的头信息。若在PaddlePaddle编译时,未指定按照double精度编译,默认情况下按照float精度计算,保存的参数也是float类型。这时在使用 :code:`numpy.array` 时,一般设置 :code:`dtype=float32` 。示例如下:
.. code-block:: python
def read_parameter(fname, width):
s = open(fname).read()
# skip header
vec = np.fromstring(s[16:], dtype=np.float32)
# width is the size of the corresponding layer
np.savetxt(fname + ".csv", vec.reshape(width, -1),
fmt="%.6f", delimiter=",")
将明文参数转化为PaddlePaddle可加载的模型参数时,首先构造头信息,再写入网络参数。下面的代码将随机生成的矩阵转化为可以被PaddlePaddle加载的模型参数。
.. code-block:: python
def gen_rand_param(param_file, width, height, need_trans):
np.random.seed()
header = struct.pack("iil", 0, 4, height * width)
param = np.float32(np.random.rand(height, width))
with open(param_file, "w") as fparam:
fparam.write(header + param.tostring())
7. A protocol message was rejected because it was too big
------------------------------------------------------------
如果在训练NLP相关模型时,出现以下错误:
.. code-block:: bash
[libprotobuf ERROR google/protobuf/io/coded_stream.cc:171] A protocol message was rejected because it was too big (more than 67108864 bytes). To increase the limit (or to disable these warnings), see CodedInputStream::SetTotalBytesLimit() in google/protobuf/io/coded_stream.h.
F1205 14:59:50.295174 14703 TrainerConfigHelper.cpp:59] Check failed: m->conf.ParseFromString(configProtoStr)
可能的原因是:传给dataprovider的某一个args过大,一般是由于直接传递大字典导致的。错误的define_py_data_sources2类似:
.. code-block:: python
src_dict = dict()
for line_count, line in enumerate(open(src_dict_path, "r")):
src_dict[line.strip()] = line_count
define_py_data_sources2(
train_list,
test_list,
module="dataprovider",
obj="process",
args={"src_dict": src_dict})
解决方案是:将字典的地址作为args传给dataprovider,然后在dataprovider里面根据该地址加载字典。即define_py_data_sources2应改为:
.. code-block:: python
define_py_data_sources2(
train_list,
test_list,
module="dataprovider",
obj="process",
args={"src_dict_path": src_dict_path})
完整源码可参考 `seqToseq <https://github.com/PaddlePaddle/Paddle/tree/develop/demo/seqToseq>`_ 示例。
......@@ -20,7 +20,7 @@ Docker使用入门
docker pull paddlepaddle/paddle:0.10.0
来下载Docker镜像,paddlepaddle/paddle是从官方镜像源Dockerhub.com下载的,推荐国内用户使用ocker.paddlepaddle.org/paddle下载。
来下载Docker镜像,paddlepaddle/paddle是从官方镜像源Dockerhub.com下载的,推荐国内用户使用docker.paddlepaddle.org/paddle下载。
- *容器*: 如果说一个Docker镜像就是一个程序,那容器就是这个程序运行时产生的“进程”。
实际上,一个容器就是一个操作系统的进程,但是是运行在独立的进程空间,文件系统以及网络之上。
......
......@@ -34,7 +34,7 @@ Kernel实现 | CPU、GPU共享Kernel实现在`.h`文件中,否则,CPU
注册Op | Op注册实现在`.cc`文件;Kernel注册CPU实现在`.cc`文件中,GPU实现在`.cu`文件中
实现新的op都添加至目录[paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators)下,文件命名以`*_op.h`(如有) 、 `*_op.cc``*_op.cu`(如有)结尾。
实现新的op都添加至目录[paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators)下,文件命名以`*_op.h`(如有) 、 `*_op.cc``*_op.cu`(如有)结尾。**系统会根据文件名自动构建op和其对应的Python扩展。**
下面以矩阵乘操作,即[MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc)为例来介绍如何写带Kernel的Operator。
......@@ -54,9 +54,9 @@ class MulOpMaker : public framework::OpProtoAndCheckerMaker {
public:
MulOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "The first input of mul op");
AddInput("Y", "The second input of mul op");
AddOutput("Out", "The output of mul op");
AddInput("X", "(Tensor), 2D tensor of size (M x K)");
AddInput("Y", "(Tensor), 2D tensor of size (K x N)");
AddOutput("Out", "(Tensor), 2D tensor of size (M x N)");
AddComment(R"DOC(
Two Element Mul Operator.
The equation is: Out = X * Y
......@@ -72,7 +72,7 @@ The equation is: Out = X * Y
构造函数里通过`AddInput`添加输入参数,通过`AddOutput`添加输出参数,通过`AddComment`添加Op的注释。这些函数会将对应内容添加到`OpProto`中。
上面的代码在`MulOp`中添加两个输入`X``Y`,添加了一个输出`Out`,并解释了各自含义,命名请遵守命名规范
上面的代码在`MulOp`中添加两个输入`X``Y`,添加了一个输出`Out`,并解释了各自含义,命名请遵守[命名规范](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/name_convention.md)
再以[`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37)为例:
......@@ -206,7 +206,7 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
- `REGISTER_OP` : 注册`ops::MulOp`类,类型名为`mul`,该类的`ProtoMaker`为`ops::MulOpMaker`,注册`ops::MulOpGrad`,类型名为`mul_grad`。
- `REGISTER_OP_WITHOUT_GRADIENT` : 用于注册没有反向的Op。
- `REGISTER_OP_CPU_KERNEL` :注册`ops::MulKernel`类,并特化模板参数为`paddle::platform::CPUPlace`和`float`类型,同理,注册`ops::MulKernel`类。
- `REGISTER_OP_CPU_KERNEL` :注册`ops::MulKernel`类,并特化模板参数为`paddle::platform::CPUPlace`和`float`类型,同理,注册`ops::MulGradKernel`类。
-`.cu`文件中注册GPU Kernel。
......@@ -224,45 +224,15 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
### 5. 编译
- 简单**无特殊依赖**的OP无需修改CMakeList.txt文件。[paddle/operators/CMakeLists.txt](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/CMakeLists.txt) 会自动将 `paddle/operators` 目录下新增的 `*_op.cc` 文件加入编译。
- 较为复杂、**有额外依赖** 的operator仍需要修改[paddle/operators/CMakeLists.txt](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/CMakeLists.txt)。如,`mul_op` 依赖 `math_function`,需要在`CMakeLists.txt`中添加如下内容:
运行下面命令可以进行编译:
```
op_library(mul_op SRCS mul_op.cc mul_op.cu DEPS math_function) +
```
- 运行下面命令可以进行编译:
```
make mul_op
```
```
make mul_op
```
## 绑定Python
- 绑定Python
在 [`paddle/pybind/pybind.cc
`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/pybind.cc) 使用`USE_OP`告知编译器需要链接的Op,具体解释参考[代码注释](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_registry.h#L81)。
```
USE_OP(mul);
```
如果只实现了CPU版本,则使用`USE_CPU_ONLY_OP`:
```
USE_CPU_ONLY_OP(gather);
```
如果OP不带Kernel,则使用`USE_NO_KENREL_OP`:
```
USE_NO_KENREL_OP(recurrent);
```
- 生成库
`paddle/operators` 目录下新增的 `*_op.cc` 文件会被自动添加链接到生成的lib库中。
系统会对新增的op自动绑定Python,并链接到生成的lib库中。
## 实现单元测试
......@@ -315,41 +285,27 @@ class TestMulGradOp(GradientChecker):
'Y': np.random.random((84, 100)).astype("float32")
}
def test_cpu_gpu_compare(self):
self.compare_grad(self.op, self.inputs)
def test_normal(self):
def test_check_grad_normal(self):
# mul op will enlarge the relative error
self.check_grad(
self.op, self.inputs, ["X", "Y"], "Out", max_relative_error=0.5)
self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
def test_ignore_x(self):
def test_check_grad_ingore_x(self):
self.check_grad(
self.op,
self.inputs, ["Y"],
"Out",
max_relative_error=0.5,
no_grad_set={"X"})
['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
def test_ignore_y(self):
def test_check_grad_ingore_y(self):
self.check_grad(
self.op,
self.inputs, ["X"],
"Out",
max_relative_error=0.5,
no_grad_set={"Y"})
['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
```
下面解释代码中一些关键的地方:
- 调用`create_op("mul")`创建反向Op对应的前向Op。
- 调用`compare_grad`函数对比CPU、GPU计算结果。
- `test_normal`中调用`check_grad`使用数值法检测梯度正确性和稳定性。
- 第一个参数`self.op` : 前向Op。
- 第二个参数`self.inputs` : 输入词典,词典的Key和`ProtoMaker`定义保持一致。
- 第三个参数`["X", "Y"]` : 指定对输入变量`X`、`Y`做梯度检测。
- 第四个参数`"Out"` : 指定前向网络最终的输出目标变量`Out`
- `test_ignore_x`和`test_ignore_y`分支用来测试只需要计算一个输入梯度的情况。
- `test_check_grad_normal`中调用`check_grad`使用数值法检测梯度正确性和稳定性。
- 第一个参数`["X", "Y"]` : 指定对输入变量`X``Y`做梯度检测。
- 第二个参数`"Out"` : 指定前向网络最终的输出目标变量`Out`
- 第三个参数`max_relative_error`:指定检测梯度时能容忍的最大错误值。
- `test_check_grad_ingore_x``test_check_grad_ingore_y`分支用来测试只需要计算一个输入梯度的情况。
### 编译和执行单元测试
......@@ -367,3 +323,10 @@ make test ARGS="-R test_mul_op -V"
```bash
ctest -R test_mul_op
```
## 注意事项
- 为每个Op创建单独的`*_op.h`(如有)、`*_op.cc``*_op.cu`(如有)。不允许一个文件中包含多个Op,这将会导致编译出错。
- 注册Op时的类型名,需要和该Op的名字一样。即不允许在`A_op.cc`里面,注册`REGISTER_OP(B, ...)`等,这将会导致单元测试出错。
- 如果Op没有实现GPU Kernel,请不要创建空的`*_op.cu`,这将会导致单元测试出错。
- 如果多个Op依赖一些共用的函数,可以创建非`*_op.*`格式的文件来存放,如`gather.h`文件。
# How to write a new operator
- [Background](#background)
- [Implementing C++ Types](#implementing-c++-types)
- [Defining ProtoMaker](#defining-protoMaker)
- [Defining Operator](#defining-operator)
- [Registering Operator](#registering-operator)
- [Compilation](#compilation)
- [Python Binding](#python-binding)
- [Unit Tests](#unit-tests)
- [Testing Forward Operators](#testing-forward-operators)
- [Testing Backward Operators](#testing-backward-operators)
- [Compiling and Running](#compiling-and-running)
- [Remarks](#remarks)
## Background
Here are the base types needed. For details, please refer to the design docs.
- `framework::OperatorBase`: Operator (Op)base class.
- `framework::OpKernel`: Base class for Op computation.
- `framework::OperatorWithKernel`: Inherited from OperatorBase, describing an operator with computation.
- `class OpProtoAndCheckerMaker`: Describes an Operator's input, output, attributes and description, mainly used to interface with Python API.
An operator can be differentiated by whether in has kernel methods. An operator with kernel inherits from `OperatorWithKernel` while the ones without inherit from `OperatorBase`. This tutorial focuses on implementing operators with kernels. In short, an operator includes the following information:
Information | Where is it defined
-------------- | :----------------------
OpProtoMake definition | `.cc`files, Backward Op does not need an OpProtoMake interface.
Op definition | `.cc` files
Kernel implementation | The kernel methods shared between CPU and GPU are defined in `.h` files. CPU-specific kernels live in `.cc` files, while GPU-specific kernels are implemented in `.cu`files.
Registering the Op | Ops are registered in `.cc` files; For Kernel registration, `.cc` files contain the CPU implementation, while `.cu` files contain the GPU implementation.
New Operator implementations are added to the list [paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators), with file names in the format `*_op.h` (if applicable), `*_op.cc`, `*_op.cu` (if applicable).** The system will use the naming scheme to automatically build operators and their corresponding Python extensions. **
Let's take matrix multiplication operator, [MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc), as an example to introduce the writing of an Operator with Kernel.
## Implementing C++ Types
### 1. Defining Class ProtoMaker
Matrix Multiplication can be written as $Out = X * Y$, meaning that the operation consists of two inputs and pne output.
First, define `ProtoMaker` to describe the Operator's input, output, and additional comments:
```cpp
class MulOpMaker : public framework::OpProtoAndCheckerMaker {
public:
MulOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "(Tensor), 2D tensor of size (M x K)");
AddInput("Y", "(Tensor), 2D tensor of size (K x N)");
AddOutput("Out", "(Tensor), 2D tensor of size (M x N)");
AddComment(R"DOC(
Two Element Mul Operator.
The equation is: Out = X * Y
)DOC");
}
};
```
[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L43)is inherited from`framework::OpProtoAndCheckerMaker`, consisting of 2 variables in the constructor:
- `framework::OpProto` stores Operator input and variable attribute, used for generating Python API interfaces.
- `framework::OpAttrChecker` is used to validate variable attributes.
The constructor utilizes `AddInput`, `AddOutput`, and `AddComment`, so that the corresponding information will be added to `OpProto`.
The code above adds two inputs `X` and `Y` to `MulOp`, an output `Out`, and their corresponding descriptions, in accordance to Paddle's [naming convention](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/name_convention.md).
An additional example [`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37) is implemented as follows:
```cpp
template <typename AttrType>
class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
public:
ScaleOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "The input tensor of scale operator.").NotInGradient();
AddOutput("Out", "The output tensor of scale operator.").NotInGradient();
AddComment(R"DOC(Scale operator
The equation is: Out = scale*X
)DOC");
AddAttr<AttrType>("scale", "scale of scale operator.").SetDefault(1.0);
}
};
```
There are two changes in this example:
- `AddInput("X","...").NotInGradient()` expresses that input `X` is not involved in `ScaleOp`'s corresponding computation. If an input to an operator is not participating in back-propagation, please explicitly set `.NotInGradient()`.
- `AddAttr<AttrType>("scale", "...").SetDefault(1.0);` adds `scale`constant as an attribute, and sets the default value to 1.0.
### 2. Defining Operator
The following code defines the interface for MulOp:
```cpp
class MulOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(const framework::InferShapeContext &ctx) const override {
auto dim0 = ctx.Input<Tensor>("X")->dims();
auto dim1 = ctx.Input<Tensor>("Y")->dims();
PADDLE_ENFORCE_EQ(dim0.size(), 2,
"input X(%s) should be a tensor with 2 dims, a matrix",
ctx.op_.Input("X"));
PADDLE_ENFORCE_EQ(dim1.size(), 2,
"input Y(%s) should be a tensor with 2 dims, a matrix",
ctx.op_.Input("Y"));
PADDLE_ENFORCE_EQ(
dim0[1], dim1[0],
"First matrix's width must be equal with second matrix's height.");
ctx.Output<Tensor>("Out")->Resize({dim0[0], dim1[1]});
}
};
```
[`MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L22) is inherited from `OperatorWithKernel`. Its `public` member
```cpp
using framework::OperatorWithKernel::OperatorWithKernel;
```
expresses an operator constructor using base class `OperatorWithKernel`, alternatively written as
```cpp
MulOp(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorWithKernel(type, inputs, outputs, attrs) {}
```
`InferShape` interface needs to be re-written.`InferShape` is a constant method and cannot modify Op's member variables, its constant member `const framework::InferShapeContext &ctx` can be used to extract input, output, and attributes. It functions to
- 1). validate and error out early: it checks input data dimensions and types.
- 2). configures the tensor shape in the output.
Usually `OpProtoMaker` and `Op`'s type definitions are written in `.cc` files, which also include the registration methods introduced later.
### 3. Defining OpKernel
`MulKernel` inherits `framework::OpKernel`, which includes the following templates:
- `typename Place` denotes device type. When different devices, namely the CPU and the GPU, share the same kernel, this template needs to be added. If they don't share kernels, this must not be added. An example of a non-sharing kernel is [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43).
- `typename T` denotes data type, such as `float` or `double`.
`MulKernel` types need to rewrite the interface for `Compute`.
- `Compute` takes one input variable `const framework::ExecutionContext& context`.
- Compared with `InferShapeContext`, `ExecutionContext` includes device types, and can similarly extract input, output, and attribute variables.
- `Compute` implements the computation logics of an `OpKernel`.
`MulKernel`'s implementation of `Compute` is as follows:
```cpp
template <typename Place, typename T>
class MulKernel : public framework::OpKernel {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* X = context.Input<Tensor>("X");
auto* Y = context.Input<Tensor>("Y");
auto* Z = context.Output<Tensor>("Out");
Z->mutable_data<T>(context.GetPlace());
auto* device_context =
const_cast<platform::DeviceContext*>(context.device_context_);
math::matmul<Place, T>(*X, false, *Y, false, 1, Z, 0, device_context);
}
};
```
Note that **different devices (CPU, GPU)share an Op definition; whether or not they share the same `OpKernel` depends on whether `Compute` calls functions that support both devices.**
`MulOp`'s CPU and GPU share the same `Kernel`. A non-sharing `OpKernel` example can be seen in [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43).
To ease the writing of `OpKernel` compute, and for reusing code cross-device, [`Eigen-unsupported Tensor`](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md?fileviewer=file-view-default) module is used to implement `Compute` interface. To learn about how the Eigen library is used in PaddlePaddle, please see [usage document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_cn.md).
This concludes the forward implementation of an operator. Next its operation and kernel need to be registered in a `.cc` file.
The definition of its corresponding backward operator, if applicable, is similar to that of an forward operator. **Note that a backward operator does not include a `ProtoMaker`**.
### 4. Registering Operator
- In `.cc` files, register forward and backward operator classes and the CPU kernel.
```cpp
namespace ops = paddle::operators;
REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad);
REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(mul_grad,
ops::MulGradKernel<paddle::platform::CPUPlace, float>);
```
In that code block,
- `REGISTER_OP` registers the `ops::MulOp` class, type named `mul`, its type `ProtoMaker` is `ops::MulOpMaker`, registering `ops::MulOpGrad` as `mul_grad`.
- `REGISTER_OP_WITHOUT_GRADIENT` registers an operator without gradient.
- `REGISTER_OP_CPU_KERNEL` registers `ops::MulKernel` class and specialized template types `paddle::platform::CPUPlace` and `float`, which also registers `ops::MulGradKernel`.
- Registering GPU Kernel in `.cu` files
- Note that if GPU Kernel is implemented using the `Eigen unsupported` module, then on top of `.cu`, a macro definition `#define EIGEN_USE_GPU` is needed, such as
```cpp
// if use Eigen unsupported module before include head files
#define EIGEN_USE_GPU
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel<paddle::platform::GPUPlace, float>);
REGISTER_OP_GPU_KERNEL(mul_grad,
ops::MulGradKernel<paddle::platform::GPUPlace, float>);
```
### 5. Compilation
Run the following commands to compile.
```
make mul_op
```
## Python Binding
The system will automatically bind to Python and link it to a generated library.
## Unit Tests
Unit tests for an operator include
1. comparing a forward operator's implementations on different devices,
2. comparing a backward operator's implementation on different devices, and
3. a scaling test for the backward operator.
Here, we introduce the [unit tests for `MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/test_mul_op.py).
### Testing Forward Operators
A forward operator unit test inherits `unittest.TestCase` and defines metaclass `__metaclass__ = OpTestMeta`. More concrete tests are performed in `OpTestMeta`. Testing a forward operator requires the following:
1. Defining input, output and relevant attributes in `setUp` method.
2. Generating random input data.
3. Implementing the same computation logic in a Python script:
```python
import unittest
import numpy as np
from gradient_checker import GradientChecker, create_op
from op_test_util import OpTestMeta
class TestMulOp(unittest.TestCase):
__metaclass__ = OpTestMeta
def setUp(self):
self.type = "mul"
self.inputs = {
'X': np.random.random((32, 84)).astype("float32"),
'Y': np.random.random((84, 100)).astype("float32")
}
self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])}
```
Get its output, and compare it with the forward operator's own output.
The code above first loads required packages. In addition, we have
- `self.type = "mul" ` defines the type that is identical to what the operator's registered type.
- `self.inputs` defines input, with type `numpy.array` and initializes it.
- `self.outputs` defines output and completes the same operator computation in the Python script, and returns its result from the Python script.
### Testing Backward Operators
A backward operator unit test inherits `GradientChecker`, which inherits `unittest.TestCase`. As a result, **a backward operator unit test needs to be have the prefix `test_`**.
```python
class TestMulGradOp(GradientChecker):
def setUp(self):
self.op = create_op("mul")
self.inputs = {
'X': np.random.random((32, 84)).astype("float32"),
'Y': np.random.random((84, 100)).astype("float32")
}
def test_check_grad_normal(self):
# mul op will enlarge the relative error
self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
def test_check_grad_ingore_x(self):
self.check_grad(
['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
def test_check_grad_ingore_y(self):
self.check_grad(
['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
```
Some key points in the code above include:
- `create_op("mul")` creates the backward operator's corresponding forward operator.
- `test_normal` calls `check_grad` to validate scaling tests' correctness and stability through numeric methods.
- The first variable `["X", "Y"]` appoints `X` and `Y` to be scale tested.
- The second variable `"Out"` points to the network's final output target `Out`.
- The third variable `max_relative_error` points to the maximum relative tolerance error during scaling tests.
- `test_check_grad_ingore_x` and `test_check_grad_ingore_y`branches test the cases where there is only one scaling input.
### Compiling and Running
Any new unit testing file of the format `test_*.py` added to the director `python/paddle/v2/framework/tests` is automatically added to the project to compile.
Note that **unlike the compile test for Ops, running unit tests requires compiling the entire project** and requires compiling with flag `WITH_TESTING` on i.e. `cmake paddle_dir -DWITH_TESTING=ON`.
After successfully compiling the project, run the following command to run unit tests:
```bash
make test ARGS="-R test_mul_op -V"
```
Or,
```bash
ctest -R test_mul_op
```
## Remarks
- Every `*_op.h` (if applicable), `*_op.cc`, and `*_op.cu` (if applicable) must be created for a unique Op. Compiling will fail if multiple operators are included per file.
- The type with which an operator is registered needs to be identical to the Op's name. Registering `REGISTER_OP(B, ...)` in `A_op.cc` will cause unit testing failures.
- If the operator does not implement a GPU kernel, please refrain from creating an empty `*_op.cu` file, or else unit tests will fail.
- If multiple operators rely on some shared methods, a file NOT named `*_op.*` can be created to store them, such as `gather.h`.
## How to use Eigen in Paddle
Essentially, a neural network is a compute graph. T data needed for the computation is stored in `Tensor`s and its computation procedure is described by `Operator`s. An `Operator` calls the `Compute` interface in its corresponding `OpKernel` and operates on the `Tensor`.
### Eigen Tensor Module
The Eigen Tensor module supports powerful element-wise computation. In addition, a piece of code written using it can be run on both the CPU and the GPU.
Note that Eigen Tensor is still being actively developed, so its tests are not completely covered and its documentation may be sparse.
For details on Eigen Tensor module, please see [doc 1](https://github.com/RLovelett/eigen/blob/master/unsupported/Eigen/CXX11/src/Tensor/README.md) and [doc 2](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md).
### paddle::framework::Tensor
Paddle Tensor's is defined in the framework directory with the following interface:
```cpp
class Tensor {
public:
/*! Return a pointer to mutable memory block. */
template <typename T>
inline T* data();
/**
* @brief Return a pointer to mutable memory block.
* @note If not exist, then allocation.
*/
template <typename T>
inline T* mutable_data(platform::Place place);
/**
* @brief Return a pointer to mutable memory block.
*
* @param[in] dims The dimensions of the memory block.
* @param[in] place The place of the memory block.
*
* @note If not exist, then allocation.
*/
template <typename T>
inline T* mutable_data(DDim dims, platform::Place place);
/*! Resize the dimensions of the memory block. */
inline Tensor& Resize(const DDim& dims);
/*! Return the dimensions of the memory block. */
inline const DDim& dims() const;
private:
/*! holds the memory block if allocated. */
std::shared_ptr<Placeholder> holder_;
/*! points to dimensions of memory block. */
DDim dim_;
};
```
`Placeholder` is used to delay memory allocation; that is, we can first define a tensor, using `Resize` to configure its shape, and then call `mutuable_data` to allocate the actual memory.
```cpp
paddle::framework::Tensor t;
paddle::platform::CPUPlace place;
// set size first
t.Resize({2, 3});
// allocate memory on CPU later
t.mutable_data(place);
```
### paddle::framework::Tensor Usage
`AddOp` demonstrates Tensor's usage.
- InferShape
When computing a neural network's compute graph, first call every `Operator`'s `InferShape` method, and use `Resize` to configure the size of the output tensor.
```cpp
void InferShape(const framework::InferShapeContext &ctx) const override {
PADDLE_ENFORCE_EQ(ctx.Input<Tensor>("X")->dims(),
ctx.Input<Tensor>("Y")->dims(),
"Two input of Add Op's dimension must be same.");
ctx.Output<Tensor>("Out")->Resize(ctx.Input<Tensor>("X")->dims());
}
```
- Run
```cpp
void Compute(const framework::ExecutionContext& context) const override {
auto* input0 = context.Input<Tensor>("X");
auto* input1 = context.Input<Tensor>("Y");
auto* output = context.Output<Tensor>("Out");
output->mutable_data<T>(context.GetPlace());
auto x = EigenVector<T>::Flatten(*input0);
auto y = EigenVector<T>::Flatten(*input1);
auto z = EigenVector<T>::Flatten(*output);
auto place = context.GetEigenDevice<Place>();
z.device(place) = x + y;
}
```
### paddle::framework::Tensor到EigenTensor的转换
As shown above, in actual computation, we need to transform the input and output `Tensor`s into formats Eigen supports. We show some functions in [eigen.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/eigen.h) to implement the transformation from `paddle::framework::Tensor`to `EigenTensor/EigenMatrix/EigenVector/EigenScalar`.
Using EigenTensor as an example:
```cpp
Tensor t;
float* p = t.mutable_data<float>(make_ddim({1, 2, 3}), platform::CPUPlace());
for (int i = 0; i < 1 * 2 * 3; i++) {
p[i] = static_cast<float>(i);
}
EigenTensor<float, 3>::Type et = EigenTensor<float, 3>::From(t);
```
`From` is an interfacing method provided by the EigenTensor template, which implements the transformation from a `paddle::framework::Tensor` object to an EigenTensor. Since `rank` is a template parameter, it needs to be explicitly specified at the time of the transformation.
In Eigen, tensors with different ranks are different types, with `Vector` bring a rank-1 instance. Note that `EigenVector<T>::From` uses a transformation from an 1-dimensional Paddle tensor to a 1-dimensional Eigen tensor while `EigenVector<T>::Flatten` reshapes a paddle tensor and flattens it into a 1-dimensional Eigen tensor. Both resulting tensors are still typed EigenVector.
For more transformations, see the [unit tests](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/eigen_test.cc) in the `eigen_test.cc` file.
### Implementing Computation
While computing, the device interface is needed from the EigenTensors on the left hand side of the assignments. Note that the computation between EigenTensors only changes the data originally inthe Tensor and does not change all the shape information associated with the Tensor.
```cpp
auto x = EigenVector<T>::Flatten(*input0);
auto y = EigenVector<T>::Flatten(*input1);
auto z = EigenVector<T>::Flatten(*output);
auto place = context.GetEigenDevice<Place>();
z.device(place) = x + y;
```
In this code segment, input0/input1/output can be Tensors of arbitrary dimension. We are calling Flatten from EigenVector, transforming a tensor of any dimension into a 1-dimensional EigenVector. After completing computation, input0/input1/output will retain the same shape information, and they can be resized using the `Resize` interface.
Because the Eigen Tensor module is under-documented, please refer to `OpKernel`'s computation code in TensorFlow's [kernel module documentation](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/kernels).
# Cluster bootstrapping tool survey
## Abstract
In order to bring up a cluster from bare metal machine to a fully functional kubernetes cluster for Paddlepaddle to run, we need to utilize some tools. Here we are going to compare [Sextant](https://github.com/k8sp/sextant) and [Tectonic installer](https://github.com/coreos/tectonic-installer)
## Basic assumptions
Here are some basic assumptions before we move on to details
1. You are an administrator of a bare metal machine cluster, which means:
* you have full control to each of the machines.
* you have full control to the network which machines are connected to.
2. Machines can be booted from network with PEX or iPXE
3. You understand the [general procedure to bring up a cluster](#appendix-general-procedure-to-bring-up-a-cluster)
if your cluster is able to mark above items with checkmarks, then keep reading.
## Comparing Sextant and Tectonic installer
### Sextant
Sextant is an end2end solution to bring up a bare metal cluster to a fully functional k8s cluster, it integrates DHCP, name service, PEX, cloud-config-service, docker registry services altogether.
#### Pros
1. End2End: basically all admin need to do is to config the cluster.yaml and power on the cluster.
2. Offline cluster configuration: Sextant has 2 phases during working with it, config time and deploy time. when admin is configuring, it requires admin's machine has internet connectivity, which will download some images, etc. But in deploy time, it's completely OK to go offline since all dependencies are ready during config time.
3. docker registry integrated.
4. GPU machine took care of.
### Cons
1. k8s API server is not deployed with high availability in considering by default.
2. No grouping support.
3. No API interface, a one-off service.
### Tectonic installer
First of all, Tectonic is not free, it requires coreos.com account as a step of installation, and free user can only create less than 10 nodes.
Tectonic is a suite of software which wraps around k8s and providing more utility regarding dev ops, ie,
Tectonic installer as it's named, it installs Tectonic to a bare metal cluster which means it's not totally an equivalent of Sextant. At the "booting a cluster" part, it mostly utilizes [Matchbox](https://github.com/coreos/matchbox), which is a general cluster bootstrapper.
Matchbox's Approach is similar to Sexstant.
### Pros
1. supports grouping machines.
2. supports running provisioning service in rtk. (not a big deal though).
3. supports http/gRPC API interface.
4. supports multi-template.
### Cons
1. Not an e2e solution to bring up a cluster, need a lot of extra work and other software.
2. [Not fully supporting](https://github.com/coreos/matchbox/issues/550) centOS deployment yet.
## Conclusion
Sextant is a better solution overall for paddle cloud deploying to a bare metal cluster. It would be great if Sextant can also 1) deploy k8s api server with high availability by default; 2) not designed as a one-off service.
## Appendix: General procedure to bring up a cluster
It's physically impossible for a cluster admin to manually install OS and applications into cluster nodes one by one, here is what an admin would do in cloud industry:
1. setup a bootstrap machine with static IP in the cluster, which has following services:
* DHCP: assigns ip address for rest of the nodes.
* name service: to map node name to a IP
* PXE related services: the booting related info will be delivered to newly booted machines as their IP is assigned via DHCP service, PXE service will provide further booting and installing info and image with TFTP and http protocol.
* cluster config service: this is for providing cluster node with OS config via http
* optional docker registry: a built-in docker registry makes the whole cluster independent from connecting internet, and speeds up software distribution.
2. New node powers on, it will
* broadcast the request for an IP address
* DHCP server assigns the IP address, and deliver the PXE booting related info to the node.
* cluster node will request config files with booting info delivered with DHCP via the TFTP service, and in most of the cases, the config file will point to a http service for the booting image.
* Since PXE is configured with initrd, it will utilize the cloud config service and do further installations like coreOS or K8s installations.
* then restart the node.
For further understanding, following 2 links from Matchbox are some good readings:
* [Machine lifecycle](https://github.com/coreos/matchbox/blob/master/Documentation/machine-lifecycle.md)
* [PXE booting](https://github.com/coreos/matchbox/blob/master/Documentation/network-booting.md)
......@@ -28,53 +28,47 @@ add_style_check_target(paddle_capi ${CAPI_SOURCES} ${CAPI_HEADER}
add_dependencies(paddle_capi paddle_proto)
# combine all paddle static libraries together, into libpaddle_capi_whole.a
# user should use PaddleCAPI as -lpaddle_capi_whole
set(capi_whole_library libpaddle_capi_whole.a)
add_custom_target(paddle_capi_whole ALL
COMMAND mkdir -p o_files/capi && cd o_files/capi/ && ar -x $<TARGET_FILE:paddle_capi>
COMMAND mkdir -p o_files/utils && cd o_files/utils/ && ar -x $<TARGET_FILE:paddle_utils>
COMMAND mkdir -p o_files/parameter && cd o_files/parameter/ && ar -x $<TARGET_FILE:paddle_parameter>
COMMAND mkdir -p o_files/math && cd o_files/math/ && ar -x $<TARGET_FILE:paddle_math>
COMMAND mkdir -p o_files/cuda && cd o_files/cuda/ && ar -x $<TARGET_FILE:paddle_cuda>
COMMAND mkdir -p o_files/function && cd o_files/function/ && ar -x $<TARGET_FILE:paddle_function>
COMMAND mkdir -p o_files/gserver && cd o_files/gserver/ && ar -x $<TARGET_FILE:paddle_gserver>
COMMAND mkdir -p o_files/proto && cd o_files/proto/ && ar -x $<TARGET_FILE:paddle_proto>
COMMAND mkdir -p o_files/network && cd o_files/network/ && ar -x $<TARGET_FILE:paddle_network>
COMMAND mkdir -p o_files/pserver && cd o_files/pserver/ && ar -x $<TARGET_FILE:paddle_pserver>
COMMAND ar crs ${capi_whole_library} `find ./o_files -name '*.o'`
COMMAND rm -rf o_files
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
DEPENDS paddle_capi paddle_utils paddle_parameter paddle_math
paddle_cuda paddle_function paddle_gserver
paddle_proto paddle_pserver paddle_network
)
set_target_properties(paddle_capi_whole
PROPERTIES IMPORTED_LOCATION ${CMAKE_CURRENT_BINARY_DIR}/${capi_whole_library})
set(PADDLE_CAPI_INFER_LIBS
paddle_utils
paddle_parameter
paddle_math
paddle_cuda
paddle_function
paddle_gserver
paddle_proto
paddle_pserver
paddle_network)
set(LINK_FLAGS " -Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/export.sym -Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/export.map")
# TODO: merge mkl into paddle_capi_shared
add_library(paddle_capi_shared SHARED ${CAPI_SOURCES})
set_target_properties(paddle_capi_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
target_include_directories(paddle_capi_shared PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
link_paddle_exe(paddle_capi_shared)
cc_library(paddle_capi_whole DEPS paddle_capi ${PADDLE_CAPI_INFER_LIBS})
# No shared library for iOS
if(NOT IOS)
set(LINK_FLAGS " -Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/export.sym -Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/export.map")
# TODO: merge mkl into paddle_capi_shared
add_library(paddle_capi_shared SHARED ${CAPI_SOURCES})
set_target_properties(paddle_capi_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
target_include_directories(paddle_capi_shared PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
link_paddle_exe(paddle_capi_shared)
endif()
# install library & headers.
install(FILES ${CAPI_HEADERS} DESTINATION include/paddle)
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/config.h DESTINATION include/paddle)
if(ANDROID)
install(TARGETS paddle_capi_whole paddle_capi_shared
ARCHIVE DESTINATION lib/${ANDROID_ABI}
LIBRARY DESTINATION lib/${ANDROID_ABI})
execute_process(
COMMAND ${GIT_EXECUTABLE} log --pretty=oneline -1
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
OUTPUT_VARIABLE GIT_COMMITS_LIST
RESULT_VARIABLE GIT_COMMITS_LIST_RESULT
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
if(${GIT_COMMITS_LIST_RESULT})
set(GIT_COMMITS_LIST "No commits.")
endif()
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${capi_whole_library}
DESTINATION lib/${ANDROID_ABI})
install(TARGETS paddle_capi_shared DESTINATION lib/${ANDROID_ABI})
install(CODE "FILE(WRITE ${CMAKE_INSTALL_PREFIX}/lib/${ANDROID_ABI}/BUILD.txt
\"Compiler:\n\"
\"\\t${CMAKE_C_COMPILER}\\n\"
......@@ -88,8 +82,10 @@ if(ANDROID)
)"
)
else(ANDROID)
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${capi_whole_library} DESTINATION lib)
install(TARGETS paddle_capi_whole ARCHIVE DESTINATION lib)
if(NOT IOS)
install(TARGETS paddle_capi_shared DESTINATION lib)
endif()
endif(ANDROID)
# this variable used for unittest
......
......@@ -22,10 +22,10 @@ limitations under the License. */
*/
typedef enum {
HL_POOLING_MAX = 0,
// average includes padded values
HL_POOLING_AVERAGE = 1,
// average does not include padded values
HL_POOLING_AVERAGE_EXCLUDE_PADDING = 2,
HL_POOLING_AVERAGE = 1,
// average includes padded values
HL_POOLING_AVERAGE_INCLUDE_PADDING = 2,
HL_POOLING_END
} hl_pooling_mode_t;
......
......@@ -461,7 +461,7 @@ class add<float32x4_t> {
public:
INLINE float32x4_t operator()(const float32x4_t a,
const float32x4_t b) const {
return vmulq_f32(a, b);
return vaddq_f32(a, b);
}
};
......
......@@ -211,13 +211,11 @@ __global__ void KeAvgPoolForward(const int nthreads,
int hstart = ph * strideH - padH;
int wstart = pw * strideW - padW;
int hend = min(hstart + sizeY, height + padH);
int wend = min(wstart + sizeX, width + padW);
int pool_size = (hend - hstart) * (wend - wstart);
int hend = min(hstart + sizeY, height);
int wend = min(wstart + sizeX, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
hend = min(hend, height);
wend = min(wend, width);
int pool_size = (hend - hstart) * (wend - wstart);
real aveval = 0;
inputData += (frameNum * channels + c) * height * width;
......@@ -299,12 +297,14 @@ __global__ void KeAvgPoolBackward(const int nthreads,
outGrad += (frameNum * outStride + offsetC * pooledH * pooledW);
for (int ph = phstart; ph < phend; ++ph) {
int hstart = ph * strideH - padH;
int hend = min(hstart + sizeY, height);
hstart = max(hstart, 0);
for (int pw = pwstart; pw < pwend; ++pw) {
// figure out the pooling size
int hstart = ph * strideH - padH;
int wstart = pw * strideW - padW;
int hend = min(hstart + sizeY, height + padH);
int wend = min(wstart + sizeX, width + padW);
int wend = min(wstart + sizeX, width);
wstart = max(wstart, 0);
int poolsize = (hend - hstart) * (wend - wstart);
gradient += outGrad[ph * pooledW + pw] / poolsize;
}
......@@ -600,16 +600,13 @@ __global__ void KeAvgPool3DForward(const int nthreads,
int dstart = pd * strideD - padD;
int hstart = ph * strideH - padH;
int wstart = pw * strideW - padW;
int dend = min(dstart + sizeZ, depth + padD);
int hend = min(hstart + sizeY, height + padH);
int wend = min(wstart + sizeX, width + padW);
int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
int dend = min(dstart + sizeZ, depth);
int hend = min(hstart + sizeY, height);
int wend = min(wstart + sizeX, width);
dstart = max(dstart, 0);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
dend = min(dend, depth);
hend = min(hend, height);
wend = min(wend, width);
int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
real aveval = 0;
inputData += (frameNum * channels + c) * depth * height * width;
......@@ -712,15 +709,18 @@ __global__ void KeAvgPool3DBackward(const int nthreads,
outGrad += (frameNum * channels + offsetC) * pooledD * pooledH * pooledW;
for (int pd = pdstart; pd < pdend; ++pd) {
int dstart = pd * strideD - padD;
int dend = min(dstart + sizeZ, depth);
dstart = max(dstart, 0);
for (int ph = phstart; ph < phend; ++ph) {
int hstart = ph * strideH - padH;
int hend = min(hstart + sizeY, height);
hstart = max(hstart, 0);
for (int pw = pwstart; pw < pwend; ++pw) {
// figure out the pooling size
int dstart = pd * strideD - padD;
int hstart = ph * strideH - padH;
int wstart = pw * strideW - padW;
int dend = min(dstart + sizeZ, depth + padD);
int hend = min(hstart + sizeY, height + padH);
int wend = min(wstart + sizeX, width + padW);
int wend = min(wstart + sizeX, width);
wstart = max(wstart, 0);
int poolsize = (dend - dstart) * (hend - hstart) * (wend - wstart);
gradient += outGrad[(pd * pooledH + ph) * pooledW + pw] / poolsize;
}
......
......@@ -432,11 +432,11 @@ void hl_create_pooling_descriptor(hl_pooling_descriptor* pooling_desc,
cudnn_mode = CUDNN_POOLING_MAX;
break;
case HL_POOLING_AVERAGE:
cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
break;
case HL_POOLING_AVERAGE_EXCLUDE_PADDING:
cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
break;
case HL_POOLING_AVERAGE_INCLUDE_PADDING:
cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
break;
default:
LOG(FATAL) << "parameter mode error";
}
......
......@@ -19,14 +19,17 @@ cc_test(scope_test SRCS scope_test.cc DEPS scope)
proto_library(framework_proto SRCS framework.proto)
cc_library(attribute SRCS attribute.cc DEPS framework_proto)
cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS attribute)
cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute)
cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto proto_desc)
cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope)
cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry)
cc_library(grad_op_builder SRCS grad_op_builder.cc DEPS operator)
cc_library(op_registry SRCS op_registry.cc DEPS grad_op_builder)
cc_library(grad_op_builder SRCS grad_op_builder.cc DEPS operator proto_desc)
cc_library(op_registry SRCS op_registry.cc DEPS grad_op_builder op_proto_maker op_info)
cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
cc_test(grad_op_builder_test SRCS grad_op_builder_test.cc DEPS grad_op_builder op_registry add_op)
cc_test(grad_op_builder_test SRCS grad_op_builder_test.cc DEPS grad_op_builder op_registry sum_op)
py_proto_compile(framework_py_proto SRCS framework.proto)
# Generate an empty __init__.py to make framework_py_proto as a valid python module.
......@@ -40,3 +43,6 @@ add_custom_command(TARGET framework_py_proto POST_BUILD
cc_library(backward SRCS backward.cc DEPS net_op)
cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context)
cc_library(tensor_array SRCS tensor_array.cc DEPS lod_tensor)
cc_test(tensor_array_test SRCS tensor_array_test.cc DEPS tensor_array place)
此差异已折叠。
......@@ -21,24 +21,37 @@ limitations under the License. */
#include <vector>
#include "paddle/framework/framework.pb.h"
#include "paddle/framework/type_defs.h"
#include "paddle/platform/enforce.h"
#include "paddle/platform/variant.h"
namespace paddle {
namespace framework {
typedef boost::variant<boost::blank, int, float, std::string, std::vector<int>,
std::vector<float>, std::vector<std::string>,
std::vector<std::pair<int, int>>>
Attribute;
typedef std::unordered_map<std::string, Attribute> AttributeMap;
ProgramDesc& GetProgramDesc();
template <typename T>
AttrType AttrTypeID();
inline AttrType AttrTypeID() {
Attribute tmp = T();
return static_cast<AttrType>(tmp.which() - 1);
}
Attribute GetAttrValue(const OpDesc::Attr& attr_desc);
class AttrReader {
public:
explicit AttrReader(const AttributeMap& attrs) : attrs_(attrs) {}
template <typename T>
inline const T& Get(const std::string& name) const {
PADDLE_ENFORCE(attrs_.count(name) != 0, "%s should be in AttributeMap",
name);
return boost::get<T>(attrs_.at(name));
}
private:
const AttributeMap& attrs_;
};
// check whether a value(attribute) fit a certain limit
template <typename T>
class GreaterThanChecker {
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册