提交 ee3483b0 编写于 作者: Y Yancey1989

Merge branch 'develop' of github.com:PaddlePaddle/Paddle into fix_404_dist_train

...@@ -16,6 +16,8 @@ cmake_minimum_required(VERSION 3.0) ...@@ -16,6 +16,8 @@ cmake_minimum_required(VERSION 3.0)
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake") set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
SET(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
SET(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
include(system) include(system)
...@@ -54,6 +56,7 @@ option(WITH_C_API "Compile PaddlePaddle with C-API(Prediction)" OFF) ...@@ -54,6 +56,7 @@ option(WITH_C_API "Compile PaddlePaddle with C-API(Prediction)" OFF)
option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF) option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF)
option(GLIDE_INSTALL "Download and install go dependencies " ON) option(GLIDE_INSTALL "Download and install go dependencies " ON)
option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF) option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF)
option(WITH_DISTRIBUTE "Compile with grpc distributed support" OFF)
option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF) option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF)
# CMAKE_BUILD_TYPE # CMAKE_BUILD_TYPE
...@@ -67,9 +70,6 @@ if(ANDROID OR IOS) ...@@ -67,9 +70,6 @@ if(ANDROID OR IOS)
if(ANDROID) if(ANDROID)
if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "16") if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "16")
message(FATAL_ERROR "Unsupport standalone toolchains with Android API level lower than 16") message(FATAL_ERROR "Unsupport standalone toolchains with Android API level lower than 16")
elseif(${CMAKE_SYSTEM_VERSION} VERSION_LESS "21")
# TODO: support glog for Android api 16 ~ 19 in the future
message(WARNING "Using the unofficial git repository <https://github.com/Xreki/glog.git> instead")
endif() endif()
endif() endif()
...@@ -83,6 +83,8 @@ if(ANDROID OR IOS) ...@@ -83,6 +83,8 @@ if(ANDROID OR IOS)
"Disable RDMA when cross-compiling for Android and iOS" FORCE) "Disable RDMA when cross-compiling for Android and iOS" FORCE)
set(WITH_MKL OFF CACHE STRING set(WITH_MKL OFF CACHE STRING
"Disable MKL when cross-compiling for Android and iOS" FORCE) "Disable MKL when cross-compiling for Android and iOS" FORCE)
set(WITH_GOLANG OFF CACHE STRING
"Disable golang when cross-compiling for Android and iOS" FORCE)
# Compile PaddlePaddle mobile inference library # Compile PaddlePaddle mobile inference library
if (NOT WITH_C_API) if (NOT WITH_C_API)
......
...@@ -6,10 +6,21 @@ width = 224 ...@@ -6,10 +6,21 @@ width = 224
num_class = 1000 num_class = 1000
batch_size = get_config_arg('batch_size', int, 128) batch_size = get_config_arg('batch_size', int, 128)
use_gpu = get_config_arg('use_gpu', bool, True) use_gpu = get_config_arg('use_gpu', bool, True)
is_infer = get_config_arg("is_infer", bool, False)
args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
args = {
'height': height,
'width': width,
'color': True,
'num_class': num_class,
'is_infer': is_infer
}
define_py_data_sources2( define_py_data_sources2(
"train.list", None, module="provider", obj="process", args=args) "train.list" if not is_infer else None,
"test.list" if is_infer else None,
module="provider",
obj="process",
args=args)
settings( settings(
batch_size=batch_size, batch_size=batch_size,
...@@ -146,7 +157,6 @@ def inception(name, input, channels, \ ...@@ -146,7 +157,6 @@ def inception(name, input, channels, \
return cat return cat
lab = data_layer(name="label", size=1000)
data = data_layer(name="input", size=3 * height * width) data = data_layer(name="input", size=3 * height * width)
# stage 1 # stage 1
...@@ -224,6 +234,10 @@ pool5 = img_pool_layer( ...@@ -224,6 +234,10 @@ pool5 = img_pool_layer(
dropout = dropout_layer(name="dropout", input=pool5, dropout_rate=0.4) dropout = dropout_layer(name="dropout", input=pool5, dropout_rate=0.4)
out3 = fc_layer( out3 = fc_layer(
name="output3", input=dropout, size=1000, act=SoftmaxActivation()) name="output3", input=dropout, size=1000, act=SoftmaxActivation())
loss3 = cross_entropy(name='loss3', input=out3, label=lab)
outputs(loss3) if is_infer:
outputs(out3)
else:
lab = data_layer(name="label", size=num_class)
loss3 = cross_entropy(name='loss3', input=out3, label=lab)
outputs(loss3)
...@@ -13,14 +13,20 @@ def initHook(settings, height, width, color, num_class, **kwargs): ...@@ -13,14 +13,20 @@ def initHook(settings, height, width, color, num_class, **kwargs):
settings.data_size = settings.height * settings.width * 3 settings.data_size = settings.height * settings.width * 3
else: else:
settings.data_size = settings.height * settings.width settings.data_size = settings.height * settings.width
settings.is_infer = kwargs.get('is_infer', False)
if settings.is_infer:
settings.slots = [dense_vector(settings.data_size)]
else:
settings.slots = [dense_vector(settings.data_size), integer_value(1)] settings.slots = [dense_vector(settings.data_size), integer_value(1)]
@provider( @provider(
init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM) init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM)
def process(settings, file_list): def process(settings, file_list):
for i in xrange(1024): for i in xrange(2560 if settings.is_infer else 1024):
img = np.random.rand(1, settings.data_size).reshape(-1, 1).flatten() img = np.random.rand(1, settings.data_size).reshape(-1, 1).flatten()
if settings.is_infer:
yield img.astype('float32')
else:
lab = random.randint(0, settings.num_class - 1) lab = random.randint(0, settings.num_class - 1)
yield img.astype('float32'), int(lab) yield img.astype('float32'), int(lab)
...@@ -6,11 +6,21 @@ width = 224 ...@@ -6,11 +6,21 @@ width = 224
num_class = 1000 num_class = 1000
batch_size = get_config_arg('batch_size', int, 64) batch_size = get_config_arg('batch_size', int, 64)
layer_num = get_config_arg("layer_num", int, 50) layer_num = get_config_arg("layer_num", int, 50)
is_test = get_config_arg("is_test", bool, False) is_infer = get_config_arg("is_infer", bool, False)
args = {'height': height, 'width': width, 'color': True, 'num_class': num_class} args = {
'height': height,
'width': width,
'color': True,
'num_class': num_class,
'is_infer': is_infer
}
define_py_data_sources2( define_py_data_sources2(
"train.list", None, module="provider", obj="process", args=args) "train.list" if not is_infer else None,
"test.list" if is_infer else None,
module="provider",
obj="process",
args=args)
settings( settings(
batch_size=batch_size, batch_size=batch_size,
...@@ -45,7 +55,10 @@ def conv_bn_layer(name, ...@@ -45,7 +55,10 @@ def conv_bn_layer(name,
act=LinearActivation(), act=LinearActivation(),
bias_attr=False) bias_attr=False)
return batch_norm_layer( return batch_norm_layer(
name=name + "_bn", input=tmp, act=active_type, use_global_stats=is_test) name=name + "_bn",
input=tmp,
act=active_type,
use_global_stats=is_infer)
def bottleneck_block(name, input, num_filters1, num_filters2): def bottleneck_block(name, input, num_filters1, num_filters2):
...@@ -207,7 +220,9 @@ elif layer_num == 152: ...@@ -207,7 +220,9 @@ elif layer_num == 152:
else: else:
print("Wrong layer number.") print("Wrong layer number.")
lbl = data_layer(name="label", size=num_class) if is_infer:
loss = cross_entropy(name='loss', input=resnet, label=lbl) outputs(resnet)
inputs(img, lbl) else:
outputs(loss) lbl = data_layer(name="label", size=num_class)
loss = cross_entropy(name='loss', input=resnet, label=lbl)
outputs(loss)
set -e
function clock_to_seconds() {
  # Convert a clock string "HH:MM:SS" (the seconds field may carry a
  # fractional part) given as $1 into a number of seconds, printed with
  # two decimals followed by a newline.
  #
  # The fields are handed to awk as input data rather than being spliced
  # into the awk program text: input fields are always read as decimal
  # numbers, an empty argument yields 0.00 instead of an awk syntax error,
  # and nothing in the argument can be interpreted as awk code.
  echo "$1" | awk -F ':' '{printf "%.2f\n", $3 + $2 * 60 + $1 * 3600}'
}
function infer() {
  # Run one CPU inference benchmark and append the measured throughput to
  # its log file.
  #
  # Arguments:
  #   $1  topology    config name; "${topology}.py" is passed as --config
  #   $2  layer_num   forwarded to the config through --config_args
  #   $3  bs          inference batch size
  #   $4  use_mkldnn  "True" or "False"; any other value aborts the script
  #
  # If models/<topology>-<layer_num>/pass-00000/ does not exist, a one-pass
  # model is trained first so that there are parameters to load.
  unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
  local topology=$1
  local layer_num=$2
  local bs=$3
  local use_mkldnn=$4
  local thread log models_in log_period
  local start end start_sec end_sec fps

  if [ "$use_mkldnn" == "True" ]; then
    thread=1
    log="logs/infer-${topology}-${layer_num}-mkldnn-${bs}.log"
  elif [ "$use_mkldnn" == "False" ]; then
    # Use one trainer per core, but never more trainers than samples in a batch.
    thread=`nproc`
    if [ "$thread" -gt "$bs" ]; then
      thread=$bs
    fi
    log="logs/infer-${topology}-${layer_num}-${thread}mklml-${bs}.log"
  else
    echo "Wrong input $use_mkldnn, use True or False." >&2
    # A bad argument is a failure; exit non-zero so callers can detect it.
    exit 1
  fi

  models_in="models/${topology}-${layer_num}/pass-00000/"
  if [ ! -d "$models_in" ]; then
    echo "Training model ${topology}_${layer_num}"
    paddle train --job=train \
      --config="${topology}.py" \
      --use_mkldnn=True \
      --use_gpu=False \
      --trainer_count=1 \
      --num_passes=1 \
      --save_dir="models/${topology}-${layer_num}" \
      --config_args="batch_size=128,layer_num=${layer_num}" \
      > /dev/null 2>&1
    echo "Done"
  fi

  # One log line every 256 samples; the FPS computation below relies on this.
  log_period=$((256 / bs))
  paddle train --job=test \
    --config="${topology}.py" \
    --use_mkldnn="$use_mkldnn" \
    --use_gpu=False \
    --trainer_count="$thread" \
    --log_period="$log_period" \
    --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True" \
    --init_model_path="$models_in" \
    2>&1 | tee "${log}"

  # Measure the last 5 log periods (1280 samples); everything before that
  # is treated as warm-up. The timestamp is the second whitespace-separated
  # field of the 7th-from-last and 2nd-from-last log lines.
  start=`tail -n 7 "${log}" | head -n 1 | awk -F ' ' '{print $2}' | xargs`
  end=`tail -n 2 "${log}" | head -n 1 | awk -F ' ' '{print $2}' | xargs`
  start_sec=`clock_to_seconds "$start"`
  end_sec=`clock_to_seconds "$end"`
  # Pass the values with -v instead of splicing them into the program, and
  # report "nan" instead of dividing by zero when the interval is not positive.
  fps=`awk -v s="$start_sec" -v e="$end_sec" \
    'BEGIN{d = e - s; if (d > 0) printf "%.2f", 1280 / d; else printf "nan"}'`
  echo "Last 1280 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec)" >> "${log}"
  echo "FPS: $fps images/sec" 2>&1 | tee -a "${log}"
}
# The data provider generates random samples, so the list files only need
# to exist; create single-blank-line placeholders when they are missing.
for list_file in train.list test.list; do
  if [ ! -f "$list_file" ]; then
    echo " " > "$list_file"
  fi
done

# Output directories for benchmark logs and trained models.
for work_dir in logs models; do
  if [ ! -d "$work_dir" ]; then
    mkdir -p "$work_dir"
  fi
done

# inference benchmark
for mkldnn_mode in True False; do
  for batch in 1 2 4 8 16; do
    # Each entry is "<topology> <layer_num>"; left unquoted on purpose so
    # it splits into the first two arguments of infer.
    for net in "googlenet v1" "resnet 50" "vgg 19"; do
      infer $net $batch $mkldnn_mode
    done
  done
done
...@@ -8,13 +8,13 @@ function train() { ...@@ -8,13 +8,13 @@ function train() {
use_mkldnn=$4 use_mkldnn=$4
if [ $4 == "True" ]; then if [ $4 == "True" ]; then
thread=1 thread=1
log="logs/${topology}-${layer_num}-mkldnn-${bs}.log" log="logs/train-${topology}-${layer_num}-mkldnn-${bs}.log"
elif [ $4 == "False" ]; then elif [ $4 == "False" ]; then
thread=`nproc` thread=`nproc`
# each trainer_count use only 1 core to avoid conflict # each trainer_count use only 1 core to avoid conflict
log="logs/${topology}-${layer_num}-${thread}mklml-${bs}.log" log="logs/train-${topology}-${layer_num}-${thread}mklml-${bs}.log"
else else
echo "Wrong input $3, use True or False." echo "Wrong input $4, use True or False."
exit 0 exit 0
fi fi
args="batch_size=${bs},layer_num=${layer_num}" args="batch_size=${bs},layer_num=${layer_num}"
...@@ -30,13 +30,14 @@ function train() { ...@@ -30,13 +30,14 @@ function train() {
2>&1 | tee ${log} 2>&1 | tee ${log}
} }
if [ ! -d "train.list" ]; then if [ ! -f "train.list" ]; then
echo " " > train.list echo " " > train.list
fi fi
if [ ! -d "logs" ]; then if [ ! -d "logs" ]; then
mkdir logs mkdir logs
fi fi
# training benchmark
for use_mkldnn in True False; do for use_mkldnn in True False; do
for batchsize in 64 128 256; do for batchsize in 64 128 256; do
train vgg 19 $batchsize $use_mkldnn train vgg 19 $batchsize $use_mkldnn
......
...@@ -6,10 +6,21 @@ width = 224 ...@@ -6,10 +6,21 @@ width = 224
num_class = 1000 num_class = 1000
batch_size = get_config_arg('batch_size', int, 64) batch_size = get_config_arg('batch_size', int, 64)
layer_num = get_config_arg('layer_num', int, 19) layer_num = get_config_arg('layer_num', int, 19)
is_infer = get_config_arg("is_infer", bool, False)
args = {'height': height, 'width': width, 'color': True, 'num_class': num_class} args = {
'height': height,
'width': width,
'color': True,
'num_class': num_class,
'is_infer': is_infer
}
define_py_data_sources2( define_py_data_sources2(
"train.list", None, module="provider", obj="process", args=args) "train.list" if not is_infer else None,
"test.list" if is_infer else None,
module="provider",
obj="process",
args=args)
settings( settings(
batch_size=batch_size, batch_size=batch_size,
...@@ -98,6 +109,9 @@ elif layer_num == 19: ...@@ -98,6 +109,9 @@ elif layer_num == 19:
else: else:
print("Wrong layer number.") print("Wrong layer number.")
lab = data_layer('label', num_class) if is_infer:
loss = cross_entropy(input=vgg, label=lab) outputs(vgg)
outputs(loss) else:
lab = data_layer('label', num_class)
loss = cross_entropy(input=vgg, label=lab)
outputs(loss)
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
# limitations under the License. # limitations under the License.
# #
IF(MOBILE_INFERENCE) IF(MOBILE_INFERENCE OR NOT WITH_DISTRIBUTE)
return() return()
ENDIF() ENDIF()
......
...@@ -26,12 +26,21 @@ ENDIF(WIN32) ...@@ -26,12 +26,21 @@ ENDIF(WIN32)
INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR}) INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR})
# Select the glog source to download.
# The version is quoted so that the condition stays well-formed when a
# toolchain leaves CMAKE_SYSTEM_VERSION unset: an unquoted empty expansion
# would remove the left operand of VERSION_LESS and make IF() fail to parse.
IF(ANDROID AND "${CMAKE_SYSTEM_VERSION}" VERSION_LESS "21")
  # Using the unofficial glog for Android API < 21
  SET(GLOG_REPOSITORY "https://github.com/Xreki/glog.git")
  SET(GLOG_TAG "8a547150548b284382ccb6582408e9140ff2bea8")
ELSE()
  SET(GLOG_REPOSITORY "https://github.com/google/glog.git")
  SET(GLOG_TAG "v0.3.5")
ENDIF()
ExternalProject_Add( ExternalProject_Add(
extern_glog extern_glog
${EXTERNAL_PROJECT_LOG_ARGS} ${EXTERNAL_PROJECT_LOG_ARGS}
DEPENDS gflags DEPENDS gflags
GIT_REPOSITORY "https://github.com/google/glog.git" GIT_REPOSITORY ${GLOG_REPOSITORY}
GIT_TAG v0.3.5 GIT_TAG ${GLOG_TAG}
PREFIX ${GLOG_SOURCES_DIR} PREFIX ${GLOG_SOURCES_DIR}
UPDATE_COMMAND "" UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
......
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
# limitations under the License. # limitations under the License.
# #
IF(MOBILE_INFERENCE) IF(MOBILE_INFERENCE OR NOT WITH_DISTRIBUTE)
return() return()
ENDIF() ENDIF()
...@@ -23,6 +23,11 @@ SET(GRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/grpc) ...@@ -23,6 +23,11 @@ SET(GRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/grpc)
SET(GRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/grpc) SET(GRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/grpc)
SET(GRPC_INCLUDE_DIR "${GRPC_INSTALL_DIR}/include/" CACHE PATH "grpc include directory." FORCE) SET(GRPC_INCLUDE_DIR "${GRPC_INSTALL_DIR}/include/" CACHE PATH "grpc include directory." FORCE)
SET(GRPC_CPP_PLUGIN "${GRPC_INSTALL_DIR}/bin/grpc_cpp_plugin" CACHE FILEPATH "GRPC_CPP_PLUGIN" FORCE) SET(GRPC_CPP_PLUGIN "${GRPC_INSTALL_DIR}/bin/grpc_cpp_plugin" CACHE FILEPATH "GRPC_CPP_PLUGIN" FORCE)
# Command used to build grpc: the static libraries plus grpc_cpp_plugin,
# without relying on a system protobuf.
set(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin)
if(APPLE)
  # On Apple platforms, dry-run make (-n), strip every -Werror from the
  # printed commands with sed, and execute the result through sh.
  set(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin | sed "s/-Werror//g" | sh)
endif()
ExternalProject_Add( ExternalProject_Add(
extern_grpc extern_grpc
...@@ -33,7 +38,11 @@ ExternalProject_Add( ...@@ -33,7 +38,11 @@ ExternalProject_Add(
UPDATE_COMMAND "" UPDATE_COMMAND ""
CONFIGURE_COMMAND "" CONFIGURE_COMMAND ""
BUILD_IN_SOURCE 1 BUILD_IN_SOURCE 1
BUILD_COMMAND make # NOTE(yuyang18):
# Disable -Werror, otherwise the compile will fail in MacOS.
# It seems that we cannot configure that by make command.
# Just dry run make command and remove `-Werror`, then use a shell to run make commands
BUILD_COMMAND ${BUILD_CMD}
INSTALL_COMMAND make prefix=${GRPC_INSTALL_DIR} install INSTALL_COMMAND make prefix=${GRPC_INSTALL_DIR} install
) )
...@@ -55,4 +64,3 @@ SET_PROPERTY(TARGET grpc_unsecure PROPERTY IMPORTED_LOCATION ...@@ -55,4 +64,3 @@ SET_PROPERTY(TARGET grpc_unsecure PROPERTY IMPORTED_LOCATION
include_directories(${GRPC_INCLUDE_DIR}) include_directories(${GRPC_INCLUDE_DIR})
ADD_DEPENDENCIES(grpc++_unsecure extern_grpc) ADD_DEPENDENCIES(grpc++_unsecure extern_grpc)
...@@ -15,7 +15,18 @@ ...@@ -15,7 +15,18 @@
INCLUDE(ExternalProject) INCLUDE(ExternalProject)
# Always invoke `FIND_PACKAGE(Protobuf)` for importing function protobuf_generate_cpp # Always invoke `FIND_PACKAGE(Protobuf)` for importing function protobuf_generate_cpp
FIND_PACKAGE(Protobuf QUIET) FIND_PACKAGE(Protobuf QUIET)
SET(PROTOBUF_FOUND "OFF") macro(UNSET_VAR VAR_NAME)
UNSET(${VAR_NAME} CACHE)
UNSET(${VAR_NAME})
endmacro()
# Drop whatever FIND_PACKAGE(Protobuf) left behind, from both the cache and
# the current scope, so the values are determined again by the logic below.
foreach(stale_protobuf_var
    PROTOBUF_INCLUDE_DIR
    PROTOBUF_FOUND
    PROTOBUF_PROTOC_EXECUTABLE
    PROTOBUF_PROTOC_LIBRARY
    PROTOBUF_LITE_LIBRARY
    PROTOBUF_LIBRARY
    Protobuf_PROTOC_EXECUTABLE)
  UNSET_VAR(${stale_protobuf_var})
endforeach()
if(NOT COMMAND protobuf_generate_python) # before cmake 3.4, protobuf_genrerate_python is not defined. if(NOT COMMAND protobuf_generate_python) # before cmake 3.4, protobuf_genrerate_python is not defined.
function(protobuf_generate_python SRCS) function(protobuf_generate_python SRCS)
...@@ -110,7 +121,6 @@ macro(PROMPT_PROTOBUF_LIB) ...@@ -110,7 +121,6 @@ macro(PROMPT_PROTOBUF_LIB)
# FIND_Protobuf.cmake uses `Protobuf_PROTOC_EXECUTABLE`. # FIND_Protobuf.cmake uses `Protobuf_PROTOC_EXECUTABLE`.
# make `protobuf_generate_cpp` happy. # make `protobuf_generate_cpp` happy.
SET(Protobuf_PROTOC_EXECUTABLE ${PROTOBUF_PROTOC_EXECUTABLE}) SET(Protobuf_PROTOC_EXECUTABLE ${PROTOBUF_PROTOC_EXECUTABLE})
FOREACH(dep ${protobuf_DEPS}) FOREACH(dep ${protobuf_DEPS})
ADD_DEPENDENCIES(protobuf ${dep}) ADD_DEPENDENCIES(protobuf ${dep})
ADD_DEPENDENCIES(protobuf_lite ${dep}) ADD_DEPENDENCIES(protobuf_lite ${dep})
...@@ -128,11 +138,11 @@ endmacro() ...@@ -128,11 +138,11 @@ endmacro()
set(PROTOBUF_ROOT "" CACHE PATH "Folder contains protobuf") set(PROTOBUF_ROOT "" CACHE PATH "Folder contains protobuf")
if (NOT "${PROTOBUF_ROOT}" STREQUAL "") if (NOT "${PROTOBUF_ROOT}" STREQUAL "")
find_path(PROTOBUF_INCLUDE_DIR google/protobuf/message.h PATHS ${PROTOBUF_ROOT}/include) find_path(PROTOBUF_INCLUDE_DIR google/protobuf/message.h PATHS ${PROTOBUF_ROOT}/include NO_DEFAULT_PATH)
find_library(PROTOBUF_LIBRARY protobuf PATHS ${PROTOBUF_ROOT}/lib) find_library(PROTOBUF_LIBRARY protobuf PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
find_library(PROTOBUF_LITE_LIBRARY protobuf-lite PATHS ${PROTOBUF_ROOT}/lib) find_library(PROTOBUF_LITE_LIBRARY protobuf-lite PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
find_library(PROTOBUF_PROTOC_LIBRARY protoc PATHS ${PROTOBUF_ROOT}/lib) find_library(PROTOBUF_PROTOC_LIBRARY protoc PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
find_program(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin) find_program(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin NO_DEFAULT_PATH)
if (PROTOBUF_INCLUDE_DIR AND PROTOBUF_LIBRARY AND PROTOBUF_LITE_LIBRARY AND PROTOBUF_PROTOC_LIBRARY AND PROTOBUF_PROTOC_EXECUTABLE) if (PROTOBUF_INCLUDE_DIR AND PROTOBUF_LIBRARY AND PROTOBUF_LITE_LIBRARY AND PROTOBUF_PROTOC_LIBRARY AND PROTOBUF_PROTOC_EXECUTABLE)
message(STATUS "Using custom protobuf library in ${PROTOBUF_ROOT}.") message(STATUS "Using custom protobuf library in ${PROTOBUF_ROOT}.")
SET_PROTOBUF_VERSION() SET_PROTOBUF_VERSION()
...@@ -178,14 +188,26 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) ...@@ -178,14 +188,26 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
SET(OPTIONAL_CACHE_ARGS "-DZLIB_ROOT:STRING=${ZLIB_ROOT}") SET(OPTIONAL_CACHE_ARGS "-DZLIB_ROOT:STRING=${ZLIB_ROOT}")
ENDIF() ENDIF()
# Choose which protobuf sources to download.
if(MOBILE_INFERENCE)
  # The reason why the official version is not used is described in
  # https://github.com/PaddlePaddle/Paddle/issues/6114
  set(PROTOBUF_REPO "https://github.com/qingqing01/protobuf.git")
  set(PROTOBUF_TAG "v3.2.0")
  if(NOT BUILD_FOR_HOST)
    # Do not build the protoc binaries for the non-host (target) build.
    list(APPEND OPTIONAL_ARGS "-Dprotobuf_BUILD_PROTOC_BINARIES=OFF")
  endif()
else()
  set(PROTOBUF_REPO "https://github.com/google/protobuf.git")
  set(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546")
endif()
ExternalProject_Add( ExternalProject_Add(
${TARGET_NAME} ${TARGET_NAME}
${EXTERNAL_PROJECT_LOG_ARGS} ${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${PROTOBUF_SOURCES_DIR} PREFIX ${PROTOBUF_SOURCES_DIR}
UPDATE_COMMAND "" UPDATE_COMMAND ""
DEPENDS zlib DEPENDS zlib
GIT_REPOSITORY "https://github.com/google/protobuf.git" GIT_REPOSITORY ${PROTOBUF_REPO}
GIT_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546" GIT_TAG ${PROTOBUF_TAG}
CONFIGURE_COMMAND CONFIGURE_COMMAND
${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/${TARGET_NAME}/cmake ${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/${TARGET_NAME}/cmake
${OPTIONAL_ARGS} ${OPTIONAL_ARGS}
...@@ -203,7 +225,11 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) ...@@ -203,7 +225,11 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
) )
ENDFUNCTION() ENDFUNCTION()
SET(PROTOBUF_VERSION 3.1) IF(NOT MOBILE_INFERENCE)
SET(PROTOBUF_VERSION 3.1)
ELSE()
SET(PROTOBUF_VERSION 3.2)
ENDIF()
IF(CMAKE_CROSSCOMPILING) IF(CMAKE_CROSSCOMPILING)
build_protobuf(protobuf_host TRUE) build_protobuf(protobuf_host TRUE)
LIST(APPEND external_project_dependencies protobuf_host) LIST(APPEND external_project_dependencies protobuf_host)
......
...@@ -111,6 +111,8 @@ set(COMMON_FLAGS ...@@ -111,6 +111,8 @@ set(COMMON_FLAGS
-Wno-error=sign-compare -Wno-error=sign-compare
-Wno-error=unused-local-typedefs -Wno-error=unused-local-typedefs
-Wno-error=parentheses-equality # Warnings in pybind11 -Wno-error=parentheses-equality # Warnings in pybind11
-Wno-error=ignored-attributes # Warnings in Eigen, gcc 6.3
-Wno-error=terminate # Warning in PADDLE_ENFORCE
) )
set(GPU_COMMON_FLAGS set(GPU_COMMON_FLAGS
......
...@@ -227,8 +227,8 @@ function(cc_test TARGET_NAME) ...@@ -227,8 +227,8 @@ function(cc_test TARGET_NAME)
set(multiValueArgs SRCS DEPS) set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_executable(${TARGET_NAME} ${cc_test_SRCS}) add_executable(${TARGET_NAME} ${cc_test_SRCS})
target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main) target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
add_dependencies(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main) add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
endif() endif()
endfunction(cc_test) endfunction(cc_test)
...@@ -288,8 +288,8 @@ function(nv_test TARGET_NAME) ...@@ -288,8 +288,8 @@ function(nv_test TARGET_NAME)
set(multiValueArgs SRCS DEPS) set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS}) cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} gtest gtest_main) target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
add_dependencies(${TARGET_NAME} ${nv_test_DEPS} gtest gtest_main) add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
add_test(${TARGET_NAME} ${TARGET_NAME}) add_test(${TARGET_NAME} ${TARGET_NAME})
endif() endif()
endfunction(nv_test) endfunction(nv_test)
...@@ -505,12 +505,12 @@ function(grpc_library TARGET_NAME) ...@@ -505,12 +505,12 @@ function(grpc_library TARGET_NAME)
set_source_files_properties( set_source_files_properties(
${grpc_grpc_srcs} ${grpc_grpc_srcs}
PROPERTIES PROPERTIES
COMPILE_FLAGS "-Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
cc_library("${TARGET_NAME}_grpc" SRCS "${grpc_grpc_srcs}") cc_library("${TARGET_NAME}_grpc" SRCS "${grpc_grpc_srcs}")
set_source_files_properties( set_source_files_properties(
${grpc_library_SRCS} ${grpc_library_SRCS}
PROPERTIES PROPERTIES
COMPILE_FLAGS "-Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
cc_library("${TARGET_NAME}" SRCS "${grpc_library_SRCS}" DEPS "${TARGET_NAME}_grpc" "${TARGET_NAME}_proto" "${grpc_library_DEPS}") cc_library("${TARGET_NAME}" SRCS "${grpc_library_SRCS}" DEPS "${TARGET_NAME}_grpc" "${TARGET_NAME}_proto" "${grpc_library_DEPS}")
endfunction() endfunction()
...@@ -7,3 +7,4 @@ API ...@@ -7,3 +7,4 @@ API
v2/model_configs.rst v2/model_configs.rst
v2/data.rst v2/data.rst
v2/run_logic.rst v2/run_logic.rst
v2/fluid.rst
======================
Fluid
======================
.. toctree::
:maxdepth: 1
fluid/layers.rst
fluid/data_feeder.rst
fluid/executor.rst
fluid/initializer.rst
fluid/evaluator.rst
fluid/nets.rst
fluid/optimizer.rst
fluid/param_attr.rst
fluid/profiler.rst
fluid/regularizer.rst
===========
DataFeeder
===========
DataFeeder
-----------
.. automodule:: paddle.v2.fluid.data_feeder
:members: DataFeeder
:noindex:
===========
Evaluator
===========
Evaluator
-----------
.. automodule:: paddle.v2.fluid.evaluator
:members: Evaluator
:noindex:
===========
Executor
===========
Executor
-----------
.. automodule:: paddle.v2.fluid.executor
:members: Executor
:noindex:
===========
Initializer
===========
Initializer
-----------
.. automodule:: paddle.v2.fluid.initializer
:members: Initializer
:noindex:
ConstantInitializer
-------------------
.. automodule:: paddle.v2.fluid.initializer
:members: ConstantInitializer
:noindex:
UniformInitializer
------------------
.. automodule:: paddle.v2.fluid.initializer
:members: UniformInitializer
:noindex:
NormalInitializer
-----------------
.. automodule:: paddle.v2.fluid.initializer
:members: NormalInitializer
:noindex:
XavierInitializer
-----------------
.. automodule:: paddle.v2.fluid.initializer
:members: XavierInitializer
:noindex:
MSRAInitializer
---------------
.. automodule:: paddle.v2.fluid.initializer
:members: MSRAInitializer
:noindex:
==========
Layers
==========
fc
---
.. autofunction:: paddle.v2.fluid.layers.fc
:noindex:
embedding
---------
.. autofunction:: paddle.v2.fluid.layers.embedding
:noindex:
dynamic_lstm
------------
.. autofunction:: paddle.v2.fluid.layers.dynamic_lstm
:noindex:
data
---------
.. autofunction:: paddle.v2.fluid.layers.data
:noindex:
mean
---------
.. autofunction:: paddle.v2.fluid.layers.mean
:noindex:
mul
---------
.. autofunction:: paddle.v2.fluid.layers.mul
:noindex:
elementwise_add
---------------
.. autofunction:: paddle.v2.fluid.layers.elementwise_add
:noindex:
elementwise_div
---------------
.. autofunction:: paddle.v2.fluid.layers.elementwise_div
:noindex:
dropout
---------
.. autofunction:: paddle.v2.fluid.layers.dropout
:noindex:
reshape
---------
.. autofunction:: paddle.v2.fluid.layers.reshape
:noindex:
sigmoid
---------
.. autofunction:: paddle.v2.fluid.layers.sigmoid
:noindex:
scale
---------
.. autofunction:: paddle.v2.fluid.layers.scale
:noindex:
reshape
---------
.. autofunction:: paddle.v2.fluid.layers.reshape
:noindex:
transpose
---------
.. autofunction:: paddle.v2.fluid.layers.transpose
:noindex:
sigmoid_cross_entropy_with_logits
---------------------------------
.. autofunction:: paddle.v2.fluid.layers.sigmoid_cross_entropy_with_logits
:noindex:
cast
---------
.. autofunction:: paddle.v2.fluid.layers.cast
:noindex:
concat
---------
.. autofunction:: paddle.v2.fluid.layers.concat
:noindex:
sums
---------
.. autofunction:: paddle.v2.fluid.layers.sums
:noindex:
linear_chain_crf
----------------
.. autofunction:: paddle.v2.fluid.layers.linear_chain_crf
:noindex:
assign
---------
.. autofunction:: paddle.v2.fluid.layers.assign
:noindex:
split_lod_tensor
---------
.. autofunction:: paddle.v2.fluid.layers.split_lod_tensor
:noindex:
merge_lod_tensor
---------
.. autofunction:: paddle.v2.fluid.layers.merge_lod_tensor
:noindex:
cos_sim
---------
.. autofunction:: paddle.v2.fluid.layers.cos_sim
:noindex:
cross_entropy
---------
.. autofunction:: paddle.v2.fluid.layers.cross_entropy
:noindex:
square_error_cost
-----------------
.. autofunction:: paddle.v2.fluid.layers.square_error_cost
:noindex:
accuracy
---------
.. autofunction:: paddle.v2.fluid.layers.accuracy
:noindex:
sequence_conv
---------
.. autofunction:: paddle.v2.fluid.layers.sequence_conv
:noindex:
conv2d
---------
.. autofunction:: paddle.v2.fluid.layers.conv2d
:noindex:
sequence_pool
---------
.. autofunction:: paddle.v2.fluid.layers.sequence_pool
:noindex:
pool2d
---------
.. autofunction:: paddle.v2.fluid.layers.pool2d
:noindex:
batch_norm
---------
.. autofunction:: paddle.v2.fluid.layers.batch_norm
:noindex:
beam_search_decode
------------------
.. autofunction:: paddle.v2.fluid.layers.beam_search_decode
:noindex:
lstm
---------
.. autofunction:: paddle.v2.fluid.layers.lstm
:noindex:
lod_rank_table
---------
.. autofunction:: paddle.v2.fluid.layers.lod_rank_table
:noindex:
max_sequence_len
---------
.. autofunction:: paddle.v2.fluid.layers.max_sequence_len
:noindex:
topk
---------
.. autofunction:: paddle.v2.fluid.layers.topk
:noindex:
lod_tensor_to_array
---------
.. autofunction:: paddle.v2.fluid.layers.lod_tensor_to_array
:noindex:
array_to_lod_tensor
---------
.. autofunction:: paddle.v2.fluid.layers.array_to_lod_tensor
:noindex:
fill_constant
---------
.. autofunction:: paddle.v2.fluid.layers.fill_constant
:noindex:
fill_constant_batch_size_like
-----------------------------
.. autofunction:: paddle.v2.fluid.layers.fill_constant_batch_size_like
:noindex:
ones
---------
.. autofunction:: paddle.v2.fluid.layers.ones
:noindex:
zeros
---------
.. autofunction:: paddle.v2.fluid.layers.zeros
:noindex:
increment
---------
.. autofunction:: paddle.v2.fluid.layers.increment
:noindex:
array_write
---------
.. autofunction:: paddle.v2.fluid.layers.array_write
:noindex:
create_array
---------
.. autofunction:: paddle.v2.fluid.layers.create_array
:noindex:
less_than
---------
.. autofunction:: paddle.v2.fluid.layers.less_than
:noindex:
array_read
---------
.. autofunction:: paddle.v2.fluid.layers.array_read
:noindex:
shrink_memory
---------
.. autofunction:: paddle.v2.fluid.layers.shrink_memory
:noindex:
array_length
---------
.. autofunction:: paddle.v2.fluid.layers.array_length
:noindex:
conv2d_transpose
---------
.. autofunction:: paddle.v2.fluid.layers.conv2d_transpose
:noindex:
===========
Nets
===========
simple_img_conv_pool
--------------------
.. autofunction:: paddle.v2.fluid.nets.simple_img_conv_pool
:noindex:
img_conv_group
-----------
.. autofunction:: paddle.v2.fluid.nets.img_conv_group
:noindex:
sequence_conv_pool
------------------
.. autofunction:: paddle.v2.fluid.nets.sequence_conv_pool
:noindex:
===========
Optimizer
===========
Optimizer
-----------
.. automodule:: paddle.v2.fluid.optimizer
:members: Optimizer
:noindex:
SGDOptimizer
-----------
.. automodule:: paddle.v2.fluid.optimizer
:members: SGDOptimizer
:noindex:
MomentumOptimizer
-----------
.. automodule:: paddle.v2.fluid.optimizer
:members: MomentumOptimizer
:noindex:
AdagradOptimizer
-----------
.. automodule:: paddle.v2.fluid.optimizer
:members: AdagradOptimizer
:noindex:
AdamOptimizer
-----------
.. automodule:: paddle.v2.fluid.optimizer
:members: AdamOptimizer
:noindex:
AdamaxOptimizer
-----------
.. automodule:: paddle.v2.fluid.optimizer
:members: AdamaxOptimizer
:noindex:
DecayedAdagradOptimizer
-----------------------
.. automodule:: paddle.v2.fluid.optimizer
:members: DecayedAdagradOptimizer
:noindex:
===========
ParamAttr
===========
ParamAttr
-----------
.. automodule:: paddle.v2.fluid.param_attr
:members: ParamAttr
:noindex:
===========
Profiler
===========
Profiler
-----------
.. autofunction:: paddle.v2.fluid.profiler.cuda_profiler
:noindex:
===========
Regularizer
===========
WeightDecayRegularizer
----------------------
.. automodule:: paddle.v2.fluid.regularizer
:members: WeightDecayRegularizer
:noindex:
L2DecayRegularizer
-----------
.. automodule:: paddle.v2.fluid.regularizer
:members: L2DecayRegularizer
:noindex:
L1DecayRegularizer
-----------
.. automodule:: paddle.v2.fluid.regularizer
:members: L1DecayRegularizer
## Evaluator Design ## Evaluator Design
### The Problem ### Problem Statement
During training or serving, we provide the evaluation function to measure the model performance, e.g., accuracy, precision. In the operator based framework design, the data go through the network pipeline batch by batch. As a result, inside the operator, we only can calculate one minibatch metrics. We need to provide a mechanism to calculate the metrics for each N pass/batch the user wanted. During training or inference, we provide an evaluation function to measure the model performance, for example, accuracy, precision, etc. In the operator based framework design, the data passes through the network pipeline batch by batch. As a result, inside the operator, we only calculate the metrics for one minibatch. Thus, we need to provide a mechanism to calculate the metrics for each N pass/batch the user wants.
### Evaluator Design ### Evaluator Design
Currently, every operation is expressed in the graph. we divide the evaluator process into three steps. Currently, every operation is expressed in the graph. We divide the evaluator process into three steps.
1. Initialize the metric state and add it into the block. 1. Initialize the metric state and add it into the block.
2. Calculate the statistic of the metric state in every mini-batch. The single operator is only responsible for calculating necessary statistics for one mini-batch. For example, accuracy operator only calculate a minibatch data if run once. 2. Calculate the concerned metrics for every mini-batch. The single evaluator operator is only responsible for calculating the necessary statistics for one mini-batch. For example, the accuracy operator only calculates the accuracy for a minibatch data if run once.
3. Merge the mini-batch statistics to form the evaluation result for multiple mini-batches. When it comes to distributed training/Multi-GPU training, aggregate the value from different devices. 3. Merge the mini-batch statistics to form the evaluation result for multiple mini-batches. When it comes to distributed training/Multi-GPU training, aggregate the value from different devices.
### Implementation ### Implementation
This design is shown in python API. This design is shown in the Python API.
Each metric operator need to calculate the metric statistic and return the batch aware states, Python side responsible for accumulate the states for each pass. Each metric operator needs to calculate the metric statistic and return the batch-aware states. Python side is responsible for accumulating the states for each pass.
```python ```python
......
...@@ -28,6 +28,51 @@ The goal of float16 is to serve as a key for the executor to find and run the co ...@@ -28,6 +28,51 @@ The goal of float16 is to serve as a key for the executor to find and run the co
- [Eigen](https://github.com/RLovelett/eigen) >= 3.3 supports float16 calculation on both GPU and CPU using the `Eigen::half` class. It is mostly useful for Nvidia GPUs because of the overloaded arithmetic operators using cuda intrinsics. It falls back to using software emulation on CPU for calculation and there is no special treatment to ARM processors. - [Eigen](https://github.com/RLovelett/eigen) >= 3.3 supports float16 calculation on both GPU and CPU using the `Eigen::half` class. It is mostly useful for Nvidia GPUs because of the overloaded arithmetic operators using cuda intrinsics. It falls back to using software emulation on CPU for calculation and there is no special treatment to ARM processors.
- [ARM compute library](https://github.com/ARM-software/ComputeLibrary) >= 17.02.01 supports NEON FP16 kernels (requires ARMv8.2-A CPU). - [ARM compute library](https://github.com/ARM-software/ComputeLibrary) >= 17.02.01 supports NEON FP16 kernels (requires ARMv8.2-A CPU).
### CUDA version issue
There are currently three versions of CUDA that support the `__half` data type, namely, CUDA 7.5, 8.0, and 9.0.
CUDA 7.5 and 8.0 define `__half` as a simple struct that has a `uint16_t` data (see [`cuda_fp16.h`](https://github.com/ptillet/isaac/blob/9212ab5a3ddbe48f30ef373f9c1fb546804c7a8c/include/isaac/external/CUDA/cuda_fp16.h)) as follows:
```
typedef struct __align__(2) {
unsigned short x;
} __half;
typedef __half half;
```
This struct does not define any overloaded arithmetic operators. So you have to directly use `__hadd` instead of `+` to correctly add two half types:
```
__global__ void Add() {
half a, b, c;
c = __hadd(a, b); // correct
c = a + b; // compiler error: no operator "+" matches these operands
}
```
CUDA 9.0 provides a major update to the half data type. The related code can be found in the updated [`cuda_fp16.h`](https://github.com/ptillet/isaac/blob/master/include/isaac/external/CUDA/cuda_fp16.h) and the newly added [`cuda_fp16.hpp`](https://github.com/ptillet/isaac/blob/master/include/isaac/external/CUDA/cuda_fp16.hpp).
Essentially, CUDA 9.0 renames the original `__half` type in 7.5 and 8.0 as `__half_raw`, and defines a new `__half` class type that has constructors, conversion operators, and also provides overloaded arithmetic operators such as follows:
```
typedef struct __CUDA_ALIGN__(2) {
unsigned short x;
} __half_raw;
struct __CUDA_ALIGN__(2) __half {
protected:
unsigned short __x;
public:
// constructors and conversion operators from/to
// __half_raw and other built-in data types
}
typedef __half half;
__device__ __forceinline__
__half operator+(const __half &lh, const __half &rh) {
return __hadd(lh, rh);
}
// Other overloaded operators
```
This new design makes `c = a + b` work correctly for CUDA half data type.
## Implementation ## Implementation
The float16 class holds a 16-bit `uint16_t` data internally. The float16 class holds a 16-bit `uint16_t` data internally.
......
# Intel® MKL-DNN on PaddlePaddle: Design Doc # Intel® MKL-DNN on PaddlePaddle: Design Doc
我们计划将Intel深度神经网络数学库(**MKL-DNN**\[[1](#references)\])集成到PaddlePaddle,充分展现英特尔平台的优势,有效提升PaddlePaddle在英特尔架构上的性能。 我们计划将英特尔深度神经网络数学库[Intel MKL-DNN](https://github.com/01org/mkl-dnn)
(Intel Math Kernel Library for Deep Neural Networks)集成到PaddlePaddle,
充分展现英特尔平台的优势,有效提升PaddlePaddle在英特尔架构上的性能。
我们短期内的基本目标是: <div align="center">
<img src="image/overview.png"><br/>
Figure 1. PaddlePaddle on IA
</div>
近期目标
- 完成常用layer的MKL-DNN实现。 - 完成常用Layer的MKL-DNN实现。
- 完成常见深度神经网络VGG,GoogLeNet 和 ResNet的MKL-DNN实现。 - 完成常见深度神经网络VGG,GoogLeNet 和 ResNet的MKL-DNN实现。
目前的优化,主要针对PaddlePaddle在重构之前的代码框架以及V1的API。
具体的完成状态可以参见[这里](https://github.com/PaddlePaddle/Paddle/projects/21)
## Contents ## Contents
- [Overview](#overview) - [Overview](#overview)
- [Actions](#actions) - [Actions](#actions)
- [CMake](#cmake) - [CMake](#cmake)
- [Matrix](#matrix)
- [Layers](#layers) - [Layers](#layers)
- [Activations](#activations) - [Activations](#activations)
- [Weights](#weights) - [Parameters](#parameters)
- [Gradients](#gradients)
- [Unit Tests](#unit-tests) - [Unit Tests](#unit-tests)
- [Protobuf Messages](#protobuf-messages)
- [Python API](#python-api) - [Python API](#python-api)
- [Demos](#demos)
- [Benchmarking](#benchmarking) - [Benchmarking](#benchmarking)
- [Others](#others) - [Others](#others)
- [Design Concerns](#design-concerns) - [Design Concerns](#design-concerns)
## Overview ## Overview
我们会把MKL-DNN作为第三方库集成进PaddlePaddle,整体框架图 我们会把MKL-DNN作为第三方库集成进PaddlePaddle,与其他第三方库一样,会在编译PaddlePaddle的时候下载并编译MKL-DNN。
同时,为了进一步提升PaddlePaddle在基本数学运算的计算速度,我们也将MKLML即(MKL small library\[[1](#references)\])
作为另一个第三方库集成进PaddlePaddle,它只会包括生成好的动态库和头文件。
MKL,MKLML以及MKL-DNN三者关系如下表:
| Name | Open Source | License | Descriptions |
| :---------- | :--------------- | :---------- | :------------ |
| MKL | No | Proprietary | Accelerate math processing routines |
| MKLML | No | Proprietary | Small package of MKL, especially for Machine Learning |
| MKL-DNN | Yes | Apache 2.0 | Accelerate primitives processing routines especially for Deep Neural Networks |
MKLML可以与MKL-DNN共同使用,以此达到最好的性能。
<div align="center"> <div align="center">
<img src="image/overview.png" width=350><br/> <img src="image/engine.png"><br/>
Figure 1. PaddlePaddle on IA. Figure 2. PaddlePaddle with MKL Engines
</div> </div>
## Actions ## Actions
我们把集成方案大致分为了如下几个方面。
添加的相关文件和目录结构如下:
```txt
PaddlePaddle/Paddle
├── ...
├── cmake/
│ ├── external/
│ │ ├── ...
│ │ ├── mkldnn.cmake
│ │ └── mklml.cmake
└── paddle/
├── ...
├── math/
│ ├── ...
│ └── MKLDNNMatrix.*
└── gserver/
├── ...
├── layers/
│ ├── ...
│ └── MKLDNN*Layer.*
├── activations/
│ ├── ...
│ └── MKLDNNActivations.*
└── tests/
├── ...
├── MKLDNNTester.*
└── test_MKLDNN.cpp
```
### CMake ### CMake
我们会在`CMakeLists.txt`中会给用户添加一个`WITH_MKL`的开关,他是负责`WITH_MKLML``WITH_MKLDNN`的总开关。 `CMakeLists.txt`中提供一个与MKL有关的总开关:`WITH_MKL`,它负责决定编译时是否使用MKLML和MKL-DNN
当打开`WITH_MKL`时,会开启MKLML的功能,作为PaddlePaddle的CBLAS和LAPACK库,同时会开启Intel OpenMP用于提高MKLML的性能。 如果系统支持AVX2指令集及以上,同时会开启MKL-DNN功能。 - `WITH_MKLML` 控制是否使用MKLML库。
当打开`WITH_MKL`时,会自动使用MKLML库作为PaddlePaddle的CBLAS和LAPACK库,同时会开启Intel OpenMP用于提高MKLML的性能。
编译时会把对应的头文件和库放在`build/third_party/install/mklml/*`目录下对应的地方。
MKLML的库目前都是动态库,主要包括`libiomp5.so``libmklml_intel.so`
- `WITH_MKLDNN` 控制是否使用MKL-DNN。
当开启`WITH_MKL`时,会自动根据硬件配置[[2](#references)]选择是否编译MKL-DNN。
编译时会把对应的头文件和库放在`build/third_party/install/mkldnn/*`目录下对应的地方。
MKL-DNN的库目前只有动态库`libmkldnn.so`
当关闭`WITH_MKL`时,MKLML和MKL-DNN功能会同时关闭。 ### Matrix
目前在PaddlePaddle中数据都是以`NCHW`的格式存储,但是在MKL-DNN中的排列方式不止这一种。
所以我们定义了一个`MKLDNNMatrix`用于管理MKL-DNN数据的不同格式以及相互之间的转换。
所以,我们会在`cmake/external`目录新建`mkldnn.cmake``mklml.cmake`文件,它们会在编译PaddlePaddle的时候下载对应的软件包,并放到PaddlePaddle的third party目录中。 <div align="center">
<img src="image/matrix.png"><br/>
Figure 3. MKLDNNMatrix
</div>
### Layers ### Layers
所有MKL-DNN相关的C++ layers,都会按照PaddlePaddle的目录结构存放在 所有MKL-DNN的Layers都会继承于`MKLDNNLayer`,该类继承于PaddlePaddle的基类`Layer`
`paddle/gserver/layers`中,并且文件名都会一以*MKLDNN*开头。 `MKLDNNLayer`中会提供一些必要的接口和函数,并且会写好`forward``backward`的基本逻辑,
子类只需要使用定义好的接口,实现具体的函数功能即可。
<div align="center">
<img src="image/layers.png"><br/>
Figure 4. MKLDNNLayer
</div>
每个MKLDNNLayer都包含用于内部存储和外部存储的一系列MKLDNNMatrix:
所有MKL-DNN的layers都会继承于一个叫做`MKLDNNLayer`的父类,该父类继承于PaddlePaddle的基类`Layer` - 内部存储(internal memory):`inVal_`,`inGrad_`,`outVal_``outGrad_`,分别代表输入数据,输入梯度,输出数据和输出梯度。
- 外部存储(external memory):都是以ext开头,比如`extInVal_``extInGrad_`,它们主要是用于,
当数据格式与PaddlePaddle默认的`NCHW`格式不匹配时,转换内存的工作。
需要注意的是,PaddlePaddle的activation会直接使用`output_.value``output_.grad`
所以`extOutVal_``extOutGrad_`必须分别与`output_.value``output_.grad`共享内存,
如果不需要外部存储用于转换,那么对应的内部存储也会与它们共享内存。
- 转换函数(resetXXX): 包括`resetInValue``resetInGrad``resetOutValue``resetOutGrad`
表示对输入数据,输入梯度,输出数据和输出梯度的转换。
这些函数会根据输入参数重新设置内部和外部存储,当然这两者也可以相等,即表示不需要转换。
`MKLDNNLayer`中会提供一些必要的接口和函数,并且会写好`forward``backward`的基本逻辑。部分函数定义为纯虚函数,子类只需要实现这些函数即可 注意:每个`MKLDNNlayer`的子类只需要使用内部存储就可以了,所有外部的转换工作都会在reset系列函数中准备好
### Activations ### Activations
由于在PaddlePaddle中,激活函数是独立于layer概念的,所以会在`paddle/gserver/activations`目录下添加`MKLDNNActivation.h``MKLDNNActivation.cpp`文件用于定义和使用MKL-DNN的接口。 在重构前的PaddlePaddle中,激活函数是独立于`Layer`的概念,并且输入输出都是共用一块内存,
所以添加了对应的`MKLDNNActivation`来实现,方式类似于`MKLDNNLayer`
### Parameters
对于有参数的层,我们会保证`MKLDNNLayer`使用的参数与PaddlePaddle申请的buffer共用一块内存。
如果存在数据排列格式不一样的情况时,我们会在网络训练之前把格式转换为MKL-DNN希望的格式,
在训练结束的时候再保存为PaddlePaddle的格式,但是整个训练过程中不需要任何转换。
这样既使得最终保存的参数格式与PaddlePaddle一致,又可以避免不必要的转换。
### Gradients
由于MKL-DNN的操作都是直接覆盖的形式,也就是说输出的结果不会在原来的数据上累加,
这样带来的好处就是不需要一直清空memory,节省了不必要的操作。
但是注意的是,当网络出现分支且在`backward`的时候,需要累加不同Layer传过来的梯度。
所以在`MKLDNNlayer`中实现了一个merge的方法,此时每个小分支的`Input Gradient`
会先临时保存在`MKLDNNMatrix`中,由分支处的Layer负责求和,并把结果放到当前层的`output_.grad`中。
所以整体上,在实现每个子类的时候就不需要关心分支的事情了。
### Weights <div align="center">
由于有些layer是含有参数的,我们会尽量让MKL-DNN的参数与PaddlePaddle中`parameter`共享一块内存。 <img src="image/gradients.png"><br/>
同时,由于MKL-DNN在训练时使用的参数layout可能与PaddlePaddle默认的`nchw`不一致,我们会在网络训练的开始和结束时分别转换这个layout,使得最终保存的参数格式与PaddlePaddle一致。 Figure 5. Merge Gradients
</div>
### Unit Tests ### Unit Tests
会在`paddle/gserver/test`目录下添加`test_MKLDNN.cpp``MKLDNNTester.*`用于MKL-DNN的测试。 我们会添加`test_MKLDNN.cpp``MKLDNNTester.*`用于MKL-DNN的测试。
测试分为每个layer(或activation)的单元测试和简单网络的整体测试。 测试分为每个Layer(或Activation)的单元测试和简单网络的整体测试。
每个测试会对比PaddlePaddle中CPU算出的结果与MKL-DNN的结果,小于某个比较小的阈值认为通过。 每个测试会对比PaddlePaddle中CPU算出的结果与MKL-DNN的结果,小于某个比较小的阈值认为通过。
### Protobuf Messages
根据具体layer的需求可能会在`proto/ModelConfig.proto`里面添加必要的选项。
### Python API ### Python API
目前只考虑**v1 API** 目前只考虑**v1 API**
...@@ -80,41 +172,40 @@ if use_mkldnn ...@@ -80,41 +172,40 @@ if use_mkldnn
self.layer_type = mkldnn_* self.layer_type = mkldnn_*
``` ```
所有MKL-DNN的layer type会以*mkldnn_*开头,以示区分。 所有MKL-DNN的`layer_type`会以*mkldnn_*开头,这些会在`MKLDNN*Layer`注册layer的时候保证,以示区分。
并且可能在`python/paddle/trainer_config_helper`目录下的`activations.py ``layers.py`里面添加必要的MKL-DNN的接口。
### Demos 同时,会在`paddle/utils.Flags`中添加一个`use_mkldnn`的flag,用于选择是否使用MKL-DNN的相关功能。
会在`v1_api_demo`目录下添加一个`mkldnn`的文件夹,里面放入一些用于MKL-DNN测试的demo脚本。
### Benchmarking ### Benchmarking
会添加`benchmark/paddle/image/run_mkldnn.sh`,用于测试使用MKL-DNN之后的性能。 会添加相应的脚本在[这里](https://github.com/PaddlePaddle/Paddle/tree/develop/benchmark/paddle/image),用于测试和对比在使用MKL-DNN前后的CNN网络性能。
测试的性能对比结果会在[IntelOptimizedPaddle.md](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/IntelOptimizedPaddle.md)
### Others ### Others
1. 如果在使用MKL-DNN的情况下,会把CPU的Buffer对齐为64 1. 如果在使用MKL-DNN的情况下,会把CPU的Buffer对齐为4096,具体可以参考MKL-DNN中的[memory](https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp#L673)
2. 深入PaddlePaddle,寻找有没有其他可以优化的可能,进一步优化。比如可能会用OpenMP改进SGD的更新性能。 2. 深入PaddlePaddle,寻找有没有其他可以优化的可能,进一步优化。比如可能会用OpenMP改进SGD的更新性能。
## Design Concerns ## Design Concerns
为了更好的符合PaddlePaddle的代码风格\[[2](#references)\],同时又尽可能少的牺牲MKL-DNN的性能\[[3](#references)\] 为了更好的符合PaddlePaddle的代码风格\[[3](#references)\],同时又尽可能少的牺牲MKL-DNN的性能\[[4](#references)\]
我们总结出一些特别需要注意的点: 我们总结出一些特别需要注意的点:
1. 使用**deviceId_**。为了尽可能少的在父类Layer中添加变量或者函数,我们决定使用已有的`deviceId_`变量来区分layer的属性,定义`-2``MKLDNNLayer`特有的设备ID。 1. 使用**deviceId_**。为了尽可能少的在父类Layer中添加变量或者函数,
我们决定使用已有的`deviceId_`变量来区分layer的属性,定义`-2``MKLDNNLayer`特有的设备ID。
2. 重写父类Layer的**init**函数,修改`deviceId_``-2`,代表这个layer是用于跑在MKL-DNN的环境下。 2. 重写父类Layer的**init**函数,修改`deviceId_``-2`,代表这个layer是用于跑在MKL-DNN的环境下。
3. 创建`MKLDNNMatrix`,同时继承`CpuMatrix``mkldnn::memory`。用于管理MKL-DNN会用到的相关memory函数、接口以及会用的到格式信息。 3. 创建`MKLDNNBase`,定义一些除了layer和memory相关的类和函数。
4. 创建`MKLDNNBase`,定义一些除了layer和memory相关的类和函数。包括MKL-DNN会用到`MKLDNNStream``CPUEngine`,和未来可能还会用到`FPGAEngine`等。 包括MKL-DNN会用到`MKLDNNStream``CPUEngine`,和未来可能还会用到`FPGAEngine`等。
5. 每个`MKLDNNlayer`都会有`inVal_`,`inGrad_`,`outVal_``outGrad_`,分别代表input value, input gradient,output value和output gradient。他们会存放MKL-DNN用到的internal memory。同时还会定义以*ext*开头的`MKLDNNMatrix`(表示external的memory),主要是在格式与PaddlePaddle默认的`nchw`格式不匹配时,用于转换内存的工作。必要的转换函数也会在`MKLDNNLayer`中提前定义好,每个子类只需要调用定义好的reset buffer函数即可。 4. 如果MKL-DNN layer的后面接有cpu device,那么就会使`output_.value``extOutVal_`共享内存,
6. 每个`MKLDNNlayer`的resetbuffer相关的函数(包括reset input、output的Value和grad),他们会根据输入参数reset internal和external的memory,当然这两者也可以相等,即表示不需要转换。只需要把握一个原则,每个`MKLDNNlayer`的子类,只需要使用internal的memory就可以了,所有external的转换工作在父类的reset函数中都提前准备好了。 同时数据格式就是`NCHW`,这样下一个cpu device就能拿到正确的数据。
7. 一般来说,external的memory会尽量与PaddlePaddle中的`value``grad`共享内存。同时每个`MKLDNNLayer`中的external output value和gradient(也就是`extOutVal_``extOutGrad_`)必须分别与`output_.value``output_.grad`共享内存,因为PaddlePaddle的activation会直接使用`output_.value``output_.grad`。如果不需要external的buffer用于转换,那么internal的buffer也会与他们共享内存。 在有普通的CPU layer时, `extOutVal_``extOutGrad_`的格式始终是`NCHW`或者`NC`
8. 如果MKL-DNN layer的后面接有cpu device,那么就会使`output_.value``extOutVal_`共享内存,同时数据格式就是`nchw`,这样下一个cpu device就能拿到正确的数据。在有cpu device的时候,external的memory的格式始终是`nchw`或者`nc`
9. 由于MKL-DNN的输出操作都是覆盖data的,不是在原来的数据上累加,所以当网络出现分支时,在`backward`时会需要merge不同layer的梯度。`MKLDNNlayer`中会实现merge的方法,此时每个小分支的input gradient会先临时保存在一个`MKLDNNMatrix`中,由分支处的layer负责求和,并把结果放到这个layer的`output_.grad`中。所以整体上,每个子类并不会需要关心分支的事情,也是在父类都实现好了。
10. 在原来的`FLAGS`中添加一个`use_mkldnn`的flag,用于选择是否使用MKL-DNN的相关功能。
## References ## References
1. [MKL small library](https://github.com/01org/mkl-dnn#linking-your-application)[Intel MKL](https://software.intel.com/en-us/mkl)的一个子集。
1. [Intel Math Kernel Library for Deep Neural Networks (Intel MKL-DNN)](https://github.com/01org/mkl-dnn "Intel MKL-DNN") 主要包括了深度学习相关的数学原语与操作,一般由MKL-DNN在发布[新版本](https://github.com/01org/mkl-dnn/releases)时一起更新。
2. [原来的方案](https://github.com/PaddlePaddle/Paddle/pull/3096)会引入**nextLayer**的信息。但是在PaddlePaddle中,无论是重构前的layer还是重构后的op,都不会想要知道next layer/op的信息。 2. [MKL-DNN System Requirements](https://github.com/01org/mkl-dnn#system-requirements)
3. MKL-DNN的高性能格式与PaddlePaddle原有的`NCHW`不同(PaddlePaddle中的CUDNN部分使用的也是`NCHW`,所以不存在这个问题),所以需要引入一个转换方法,并且只需要在必要的时候转换这种格式,才能更好的发挥MKL-DNN的性能。 目前在PaddlePaddle中,仅会在支持AVX2指令集及以上的机器才使用MKL-DNN。
3. [原来的方案](https://github.com/PaddlePaddle/Paddle/pull/3096)会引入**nextLayer**的信息。
但是在PaddlePaddle中,无论是重构前的layer还是重构后的op,都不会想要知道next layer/op的信息。
4. MKL-DNN的高性能格式与PaddlePaddle原有的`NCHW`不同(PaddlePaddle中的cuDNN部分使用的也是`NCHW`,所以不存在这个问题)。
所以需要引入一个转换方法,并且只需要在必要的时候转换这种格式,才能更好的发挥MKL-DNN的性能。
...@@ -2,106 +2,70 @@ ...@@ -2,106 +2,70 @@
## Abstract ## Abstract
PaddlePaddle v0.10.0 uses the "trainer-parameter server" PaddlePaddle version 0.10.0 uses the "trainer-parameter server" architecture. We run multiple instances of trainers (where each trainer runs the same model) and parameter servers for distributed training. This architecture serves well, but has a few limitations:
architecture. We run multiple replicated instances of trainers (runs
the same code written by the user) and parameter servers for
distributed training. This architecture served us well, but has some
limitations:
1. Need to write special code to handle tasks which should only be run 1. There is a need to write special code that handles tasks which should only be run on a single trainer. E.g., initializing the model, saving the model etc.
by a single trainer. E.g., initializing model and saving model.
2. Model parallelism is hard: need to write if-else branches conditioned 2. Model parallelism is hard: It would need all the if-else branches conditioned on the trainer ID to partition the model onto the trainers, and eventually manually writing out the inter-model-shard communication code to communicate between different trainers.
on the trainer ID to partition model onto each trainer, and manually
write the inter-model-shard communication code.
3. The user can not directly specify the parameter update rule: need 3. The user can not directly specify the parameter update rule: This would need to modify the parameter server code and compile a new binary. This makes things more complicated for researchers: A lot of extra effort is required to make this work. Besides, the training job submission program may not allow running arbitrary binaries.
to modify the parameter server C++ code and compile a new
binary. This adds complication for researchers: A lot of extra
effort is required. Besides, the training job submission program
may not allow running arbitrary binaries.
This design doc discusses PaddlePaddle's new distributed training This design doc discusses PaddlePaddle's new distributed training architecture that addresses the above mentioned limitations.
architecture that addresses the above limitations.
## Analysis ## Analysis
We will assume the user writes the trainer program by Python, the same The assumption is that the user writes the trainer program in either Python or C++.
analysis holds if the trainer program is written in C++.
### Limitation 1 ### Limitation 1
If we look at the Python code that the user writes, there are two There are two basic functionalities in the trainer program:
kinds of functionalities:
- The training logic such as load / save model and print log. 1. The training logic such as loading / saving the model and printing out the logs.
- The neural network definition such as the definition of the data 2. The neural network definition such as the definition of the data layer, the fully connected layer, the cost function and the
layer, the fully connected layer, the cost function and the
optimizer. optimizer.
When we training with PaddlePaddle v0.10.0 distributedly, multiple When we train using PaddlePaddle v0.10.0 in a distributed fashion, multiple instances of the same Python code are run on different nodes, hence both: the
replicated Python instances are running on different nodes: both the training logic as well as the neural network computation logic, is replicated.
training logic and the neural network computation is replicated.
The tasks that should only run once all belong to the training logic, The tasks that only need to be run once belong to the training logic. Hence if we only replicate the neural network computation part, and do **not**
if we only replicate the neural network computation, but do **not** replicate the training logic, the limitation mentioned above can be avoided.
replicate the training logic, the limitation could be solved.
### Limitation 2 ### Limitation 2
Model parallelism means running a single model on multiple nodes by Model parallelism means that a single model is partitioned into different components and each node runs one of the components separately. This comes at the extra cost of managing the
partitioning the model onto different nodes and managing the inter-model-shard communication between nodes.
inter-model-shard communications.
PaddlePaddle should be able to modify the nerual network computation PaddlePaddle should ideally be able to modify the neural network computation and figure out the support for model parallelism automatically. However, the
definition to support model parallelism automatically. However, the computation is only specified in Python code which sits outside of PaddlePaddle, hence PaddlePaddle can not support the feature in this setup.
computation is only specified in Python code, and PaddlePaddle can not
modify Python code.
Just like compiler uses a intermediate representation (IR) so that Similar to how a compiler uses an intermediate representation (IR) so that the programmer does not need to manually optimize their code for most of the cases, we can have an intermediate representation in PaddlePaddle as well. The compiler optimizes the IR as follows:
programmer does not need to manually optimize their code in most of
the cases - the compiler will optimize the IR:
<img src="src/compiler.png"/> <img src="src/compiler.png"/>
We can have our own IR too: PaddlePaddle can support model parallel by PaddlePaddle can support model parallelism by converting the IR so that the user no longer needs to manually perform the computation and operations in the Python component:
converting the IR so the user no longer need to manually do it in
Python:
<img src="src/paddle-compile.png"/> <img src="src/paddle-compile.png"/>
The IR for PaddlePaddle after refactor is called `Block`, it specifies The IR for PaddlePaddle after refactoring is called a `Block`, it specifies the computation dependency graph and the variables used in the computation.
the computation dependency graph and the variables used in the
computation.
### Limitation 3 ### Limitation 3
The user can not directly specify the parameter update rule for the The user can not directly specify the parameter update rule for the parameter server in the Python module, since the parameter server does not use the same computation definition as the trainer. Instead, the update rule is baked inside the parameter server. The user can not specify the update rule explicitly.
parameter server because the parameter server does not use the same
computation definition as the trainer. Instead, the update rule is
baked in the parameter server. The user can not specify the update
rule in the same way of specifying the trainer computation.
This could be fixed by making the parameter server run the same This could be fixed by making the parameter server run the same computation definition as the trainer (the user's Python module). For a detailed explanation, refer to this document -
computation definition as the trainer. For a detailed explanation,
please
see
[Design Doc: Operation Graph Based Parameter Server](./parameter_server.md) [Design Doc: Operation Graph Based Parameter Server](./parameter_server.md)
## Distributed Training Architecture ## Distributed Training Architecture
The new distributed training architecture can address the above The revamped distributed training architecture can address the above discussed limitations. Below is the illustration of how it does so:
limitations. Below is the illustration:
<img src="src/distributed_architecture.png"/> <img src="src/distributed_architecture.png"/>
The architecture includes major components: *PaddlePaddle Python*, The major components in the architecture are: *PaddlePaddle Python*, *PaddlePaddle converter* and *PaddlePaddle runtime*.
*PaddlePaddle converter* and *PaddlePaddle runtime*:
### PaddlePaddle Python ### PaddlePaddle Python
PaddlePaddle Python is the Python library that user's Python trainer PaddlePaddle Python is the Python library that user's Python code invokes, to read the data, build the neural network topology, start training, etc.
invoke to build the neural network topology, start training, etc.
```Python ```Python
paddle.init() paddle.init()
...@@ -117,102 +81,60 @@ for i in range(1000): ...@@ -117,102 +81,60 @@ for i in range(1000):
print cost_val print cost_val
``` ```
The code above is a typical Python trainer code, the neural network The above code is what a typical Python trainer code is, the neural network topology is built using the helper functions such as `paddle.layer.fc`. Training is done by calling `session.eval` iteratively.
topology is built using helper functions such as
`paddle.layer.fc`. The training is done by calling `session.eval`
iteratively.
#### session.eval #### session.eval
As shown in the graph, `session.eval` sends the IR and the evaluation As shown in the graph, `session.eval` sends the IR and the evaluation inputs or targets to the PaddlePaddle cluster for evaluation.
inputs/targets to the PaddlePaddle cluster for evaluation. The The targets can be any variable in the computation graph. When the target is say, the `optimizer` variable, the neural network will be optimized once. When the target is the `cost` variable, `session.eval` returns the cost value. Based on what the target is, an appropriate action is taken.
targets can be any variable in the computation graph. When the target
is the `optimizer` variable, the neural network will be optimized
once. When the target is the `cost` variable, `session.eval` returns
the cost value.
The Python `session` is a wrapper of the C++ `Session` class. For more The Python `session` is a wrapper of the C++ `Session` class. For more information about `Session`, refer to this document - [Design Doc: Session](./session.md).
information about `Session`, please
see [Design Doc: Session](./session.md).
### PaddlePaddle Converter ### PaddlePaddle Converter
PaddlePaddle converter automatically converts the IR in the request The PaddlePaddle converter automatically converts the IR in the request (IR and evaluation inputs/targets) from PaddlePaddle Python to partitioned IRs and dispatches the new IRs and evaluation inputs/targets to different PaddlePaddle runtimes. Below are the steps that are followed :
(IR and evaluation inputs/targets) from PaddlePaddle Python to new
partitioned IRs and dispatch the new IRs and evaluation inputs/targets
to different PaddlePaddle runtimes. Below are the steps:
1. Add `feed` OP that feeds the eval inputs, and `fetch` OP that 1. Add a `feed` OP that feeds the eval inputs, and a `fetch` OP that fetches the eval targets to the IR.
fetches the eval targets to the IR.
1. Extract a new computation (sub)graph with `feed` and `fetch` OP as 2. Extract a new computation (sub)graph with the `feed` and `fetch` OPs as the boundary. The runtime does not need to run the OP that is not dependent on the `fetch` OP.
the boundary. The runtime does not need to run the OP that is not
dependent by the `fetch` OP.
1. Optimizes the computation graph. 3. Optimize the computation graph.
1. Place the OPs in the graph onto different devices on different 4. Place the OPs in the graph onto different devices on different PaddlePaddle runtime according to a placement algorithm and the device constraints specified by the user.
PaddlePaddle runtime according to a placement algorithm and device
constraint specified by the user.
1. Partition the graph according to runtime boundaries and add `send` / 5. Partition the graph according to runtime boundaries and add `send` / `recv` OP pair on the runtime boundaries.
`recv` OP pair on the runtime boundaries.
1. Dispatch the partitioned graph to different PaddlePaddle runtimes. 6. Dispatch the partitioned graph to different PaddlePaddle runtimes.
1. PaddlePaddle runtimes with the `fetch` OP report evaluation 7. PaddlePaddle runtimes with the `fetch` OP report evaluation results back to the converter, the converter reports the evaluation results back to the PaddlePaddle Python.
results back to the converter, the convert reports the evaluation
results back to the PaddlePaddle Python.
The output IRs will be cached to optimize the conversion latency. The output IRs will be cached to optimize the conversion latency.
#### Placement Algorithm #### Placement Algorithm
Our first implementation will only support "trainer-parameter server" Our first implementation will only support "trainer-parameter server" placement: the parameters, initializers, and optimizers are all placed on the PaddlePaddle runtimes with the parameter server role. Everything else will be placed on the PaddlePaddle runtimes with the trainer role. This has the same functionality as the "trainer-parameter server" architecture of PaddlePaddle v0.10.0, but is more generic and flexible.
placement: the parameters, initializers, and optimizers are placed on
the PaddlePaddle runtimes with the parameter server role. And
everything else will be placed on the PaddlePaddle runtimes with the
trainer role. This has the same functionality of our
"trainer-parameter server" architecture of PaddlePaddle v0.10.0, but
is more general and flexible.
In the future, we will implement the general placement algorithm, In the future, a more general placement algorithm should be implemented, which makes placements according to the input IR, and a model of device computation time and device communication time. Model parallelism requires the generic placement algorithm.
which makes placements according to the input IR, and a model of
device computation time and device communication time. Model
parallelism requires the general placement algorithm.
### PaddlePaddle Runtime ### PaddlePaddle Runtime
The PaddlePaddle runtime owns multiple devices (e.g., CPUs, GPUs) and The PaddlePaddle runtime owns multiple devices (e.g., CPUs, GPUs) and runs the IR. The runtime does not need to do OP placement since it is already done by the converter.
runs the IR. The runtime does not need to do OP placement since it's
already done by the converter.
### Local Training Architecture ### Local Training Architecture
The local training architecture will be the same as the distributed The local training architecture will be the same as the distributed training architecture, the difference is that everything runs locally, and there is just one PaddlePaddle runtime:
training architecture, the differences are everything runs locally,
and there is just one PaddlePaddle runtime:
<img src="src/local_architecture.png"/> <img src="src/local_architecture.png"/>
### Training Data ### Training Data
In PaddlePaddle v0.10.0, training data is typically read In PaddlePaddle v0.10.0, training data is typically read with a [data reader](../reader/README.md) from Python. This approach is no longer efficient when training in a distributed fashion since the Python process no longer runs on the same node with the trainer processes. The Python reader will need to read from the distributed filesystem (assuming it has the required access) and send to the trainers, doubling the network traffic.
with [data reader](../reader/README.md) from Python. This approach is
no longer efficient when training distributedly since the Python When doing distributed training, the user can still use Python data reader: the training data are sent with `session.eval`. However this should be used for debugging purpose only. The users are encouraged to use the read data OPs.
process no longer runs on the same node with the trainer processes,
the Python reader will need to read from the distributed filesystem
(assuming it has the access) and send to the trainers, doubling the
network traffic.
When doing distributed training, the user can still use Python data
reader: the training data are sent with `session.eval`. However should
be used for debugging purpose only. The users are encouraged to use
the read data OPs.
## References: ## References:
......
从源码编译PaddlePaddle 从源码编译
====================== ======================
.. _build_step: .. _build_step:
...@@ -7,8 +7,11 @@ ...@@ -7,8 +7,11 @@
---------------- ----------------
PaddlePaddle主要使用 `CMake <https://cmake.org>`_ 以及GCC, G++作为编译工具。 PaddlePaddle主要使用 `CMake <https://cmake.org>`_ 以及GCC, G++作为编译工具。
我们推荐您使用PaddlePaddle编译环境镜像完成编译,这样可以免去单独安装编译依赖的步骤,可选的不同编译环境 我们推荐您使用PaddlePaddle Docker编译环境镜像完成编译,这样可以免去单独安装编译依赖的步骤,可选的不同编译环境Docker镜像
可以在 `这里 <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`_ 找到。 可以在 `这里 <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`_ 找到。
如果您选择不使用Docker镜像,则需要在本机安装下面章节列出的 `编译依赖`_ 之后才能开始编译的步骤。
编译PaddlePaddle,需要执行: 编译PaddlePaddle,需要执行:
.. code-block:: bash .. code-block:: bash
...@@ -23,7 +26,6 @@ PaddlePaddle主要使用 `CMake <https://cmake.org>`_ 以及GCC, G++作为编译 ...@@ -23,7 +26,6 @@ PaddlePaddle主要使用 `CMake <https://cmake.org>`_ 以及GCC, G++作为编译
cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF .. cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF ..
make make
编译完成后会在build/python/dist目录下生成输出的whl包,可以选在在当前机器安装也可以拷贝到目标机器安装: 编译完成后会在build/python/dist目录下生成输出的whl包,可以选在在当前机器安装也可以拷贝到目标机器安装:
.. code-block:: bash .. code-block:: bash
...@@ -31,7 +33,33 @@ PaddlePaddle主要使用 `CMake <https://cmake.org>`_ 以及GCC, G++作为编译 ...@@ -31,7 +33,33 @@ PaddlePaddle主要使用 `CMake <https://cmake.org>`_ 以及GCC, G++作为编译
pip install python/dist/*.whl pip install python/dist/*.whl
.. _build_step: .. _run_test:
执行单元测试
----------------
如果您期望在编译完成后立即执行所有的单元测试,可以按照下面的方法:
使用Docker的情况下,设置 :code:`RUN_TEST=ON` 和 :code:`WITH_TESTING=ON` 就会在完成编译之后,立即执行单元测试。
开启 :code:`WITH_GPU=ON` 可以指定同时执行GPU上的单元测试。
.. code-block:: bash
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh
如果不使用Docker,执行ctest命令即可:
.. code-block:: bash
mkdir build
cd build
cmake -DWITH_GPU=OFF -DWITH_TESTING=ON ..
make
ctest
# 指定执行其中一个单元测试 test_mul_op
ctest -R test_mul_op
.. _compile_deps:
编译依赖 编译依赖
---------------- ----------------
......
Build PaddlePaddle from Sources Build from Sources
========================== ==========================
.. _build_step: .. _build_step:
...@@ -9,14 +9,18 @@ How To Build ...@@ -9,14 +9,18 @@ How To Build
PaddlePaddle mainly uses `CMake <https://cmake.org>`_ and GCC, G++ as compile PaddlePaddle mainly uses `CMake <https://cmake.org>`_ and GCC, G++ as compile
tools. We recommend you to use our pre-built Docker image to run the build tools. We recommend you to use our pre-built Docker image to run the build
to avoid installing dependencies by yourself. We have several build environment to avoid installing dependencies by yourself. We have several build environment
Docker images `here <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`_. Docker images `here <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`_ .
If you choose not to use a Docker image for your build, you need to install the
`Compile Dependencies`_ listed below before running the build.
Then run: Then run:
.. code-block:: bash .. code-block:: bash
git clone https://github.com/PaddlePaddle/Paddle.git git clone https://github.com/PaddlePaddle/Paddle.git
cd Paddle cd Paddle
# run the following command to build CPU-Only binaries if you are using docker # run the following command to build a CPU-Only binaries if you are using docker
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh
# else run these commands # else run these commands
mkdir build mkdir build
...@@ -32,7 +36,35 @@ machine or copy it to the target machine. ...@@ -32,7 +36,35 @@ machine or copy it to the target machine.
pip install python/dist/*.whl pip install python/dist/*.whl
.. _build_step:
.. _run_test:
Run Tests
----------------
If you wish to run the tests, you may follow the steps below:
When using Docker, setting :code:`RUN_TEST=ON` and :code:`WITH_TESTING=ON` will run the tests immediately after the build.
Setting :code:`WITH_GPU=ON` will also run the tests on GPU.
.. code-block:: bash
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh
If you don't use Docker, simply run ctest to start the tests:
.. code-block:: bash
mkdir build
cd build
cmake -DWITH_GPU=OFF -DWITH_TESTING=ON ..
make
ctest
# run a single test like test_mul_op
ctest -R test_mul_op
.. _compile_deps:
Compile Dependencies Compile Dependencies
---------------- ----------------
......
使用Docker安装运行PaddlePaddle 使用Docker安装运行
================================ ================================
使用Docker安装和运行PaddlePaddle可以无需考虑依赖环境即可运行。并且也可以在Windows的docker中运行。 使用Docker安装和运行PaddlePaddle可以无需考虑依赖环境即可运行。并且也可以在Windows的docker中运行。
......
PaddlePaddle in Docker Containers Run in Docker Containers
================================= =================================
Run PaddlePaddle in Docker container so that you don't need to care about Run PaddlePaddle in Docker container so that you don't need to care about
......
使用pip安装PaddlePaddle 使用pip安装
================================ ================================
PaddlePaddle可以使用常用的Python包管理工具 PaddlePaddle可以使用常用的Python包管理工具
...@@ -34,7 +34,7 @@ PaddlePaddle可以使用常用的Python包管理工具 ...@@ -34,7 +34,7 @@ PaddlePaddle可以使用常用的Python包管理工具
:align: center :align: center
.. csv-table:: 各个版本最新的whl包 .. csv-table:: 各个版本最新的whl包
:header: "版本说明", "cp27-cp27mu", "cp27-cp27mu", "C-API" :header: "版本说明", "cp27-cp27mu", "cp27-cp27m", "C-API"
:widths: 1, 3, 3, 3 :widths: 1, 3, 3, 3
"cpu_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_" "cpu_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"
......
Install PaddlePaddle Using pip Install Using pip
================================ ================================
You can use current widely used Python package management You can use current widely used Python package management
...@@ -37,7 +37,7 @@ If the links below shows up the login form, just click "Log in as guest" to star ...@@ -37,7 +37,7 @@ If the links below shows up the login form, just click "Log in as guest" to star
:align: center :align: center
.. csv-table:: whl package of each version .. csv-table:: whl package of each version
:header: "version", "cp27-cp27mu", "cp27-cp27mu", "C-API" :header: "version", "cp27-cp27mu", "cp27-cp27m", "C-API"
:widths: 1, 3, 3, 3 :widths: 1, 3, 3, 3
"cpu_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_" "cpu_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"
......
...@@ -3,12 +3,64 @@ ...@@ -3,12 +3,64 @@
################## ##################
PaddlePaddle的文档包括英文文档 ``doc`` 和中文文档 ``doc_cn`` 两个部分。文档都是通过 `cmake`_ 驱动 `sphinx`_ 编译生成,生成后的文档分别存储在编译目录的 ``doc`` 和 ``doc_cn`` 两个子目录下。 PaddlePaddle的文档包括英文文档 ``doc`` 和中文文档 ``doc_cn`` 两个部分。文档都是通过 `cmake`_ 驱动 `sphinx`_ 编译生成,生成后的文档分别存储在编译目录的 ``doc`` 和 ``doc_cn`` 两个子目录下。
也可以利用PaddlePaddle.org工具来编译文档,这种情况下所有的文件会存放在整理过的文件目录 .ppo_workspace/content 下。
如何构建文档 如何构建文档
============ ============
PaddlePaddle的文档构建有两种方式。 PaddlePaddle的文档构建有三种方式。
使用PaddlePaddle.org工具
--------------
这个是目前推荐的使用方法。除了可以自动编译文档,也可以直接在网页预览文档。
文档工具使用Docker运行,需要先在系统里安装好Docker工具包。Docker安装请参考Docker的官网。安装好Docker之后即可用以下命令启动工具:
.. code-block:: bash
mkdir paddlepaddle # Create paddlepaddle working directory
cd paddlepaddle
# Clone the content repositories
git clone https://github.com/PaddlePaddle/Paddle.git
git clone https://github.com/PaddlePaddle/book.git
git clone https://github.com/PaddlePaddle/models.git
git clone https://github.com/PaddlePaddle/Mobile.git
# Please specify the working directory through -v
docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest
注意: PaddlePaddle.org 会在 -v (volume) 指定的内容存储库运行命令
之后再用网页连到http://localhost:8000就可以在网页上生成需要的文档
编译后的文件将被存储在工作目录 <paddlepaddle working directory>/.ppo_workspace/content。
如果不想使用 Docker,你还可以通过运行Django框架直接激活工具的服务器。使用下面的命令来运行它。
.. code-block:: bash
mkdir paddlepaddle # Create paddlepaddle working directory
cd paddlepaddle
# Clone the content repositories and PaddlePaddle.org
git clone https://github.com/PaddlePaddle/Paddle.git
git clone https://github.com/PaddlePaddle/book.git
git clone https://github.com/PaddlePaddle/models.git
git clone https://github.com/PaddlePaddle/Mobile.git
git clone https://github.com/PaddlePaddle/PaddlePaddle.org.git
# Please specify the PaddlePaddle working directory. In the current setting, it should be pwd
export CONTENT_DIR=<path_to_paddlepaddle_working_directory>
export ENV=''
cd PaddlePaddle.org/portal/
pip install -r requirements.txt
python manage.py runserver
工具服务器将读取环境变量 CONTENT_DIR 来搜索代码库。请将PaddlePaddle工作目录指定给环境变量 CONTENT_DIR。
之后再用网页连到http://localhost:8000就可以在网页上生成需要的文档。
编译后的文件将被存储在工作目录 <paddlepaddle working directory>/.ppo_workspace/content。
想了解更多PaddlePaddle.org工具的详细信息,可以 `点击这里 <https://github.com/PaddlePaddle/PaddlePaddle.org/blob/develop/README.cn.md>`_ 。
使用Docker构建 使用Docker构建
-------------- --------------
...@@ -47,17 +99,12 @@ PaddlePaddle的文档构建有两种方式。 ...@@ -47,17 +99,12 @@ PaddlePaddle的文档构建有两种方式。
PaddlePaddle文档使用 `sphinx`_ 自动生成,用户可以参考sphinx教程进行书写。 PaddlePaddle文档使用 `sphinx`_ 自动生成,用户可以参考sphinx教程进行书写。
如何更新文档主题 如何更新www.paddlepaddle.org
================
PaddlePaddle文档主题在 `TO_YOUR_PADDLE_CLONE_PATH/doc_theme` 文件夹下,包含所有和前端网页设计相关的文件。
如何更新doc.paddlepaddle.org
============================ ============================
更新的文档以PR的形式提交到github中,提交方式参见 `贡献文档 <http://doc.paddlepaddle.org/develop/doc_cn/howto/dev/contribute_to_paddle_cn.html>`_ 。 更新的文档以PR的形式提交到github中,提交方式参见 `贡献文档 <http://www.paddlepaddle.org/docs/develop/documentation/en/howto/dev/contribute_to_paddle_en.html>`_ 。
目前PaddlePaddle的develop分支的文档是自动触发更新的,用户可以分别查看最新的 `中文文档 <http://doc.paddlepaddle.org/develop/doc_cn/>`_ 和 目前PaddlePaddle的develop分支的文档是自动触发更新的,用户可以分别查看最新的 `中文文档 <http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html>`_ 和
`英文文档 <http://doc.paddlepaddle.org/develop/doc/>`_ 。 `英文文档 <http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html>`_ 。
.. _cmake: https://cmake.org/ .. _cmake: https://cmake.org/
......
##################
Contribute Documentation
##################
PaddlePaddle supports English documentation ``doc`` and Chinese documentation ``doc_cn``.
Both are compiled by `cmake`_ and `sphinx`_ , the compiled documentations will be stored under ``doc`` and ``doc_cn`` directories.
When using the PaddlePaddle.org to compile documentations, the compiled documentations will be stored under a consolidated directory: .ppo_workspace/content
How to Build Documentations
============
We recommend using PaddlePaddle.org tool to build documentation
Use PaddlePaddle.org tool
--------------
This is the recommended method to build documentation. It can compile documentation and preview the documentation in a web browser.
The tool uses Docker, please install it on your system. Please check Docker official website on how to install Docker. You may use the following commands to activate the tool
.. code-block:: bash
mkdir paddlepaddle # Create paddlepaddle working directory
cd paddlepaddle
# Clone the content repositories. You may only clone the contents you need
git clone https://github.com/PaddlePaddle/Paddle.git
git clone https://github.com/PaddlePaddle/book.git
git clone https://github.com/PaddlePaddle/models.git
git clone https://github.com/PaddlePaddle/Mobile.git
# Please specify the working directory through -v
docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest
Note: PaddlePaddle.org will read the content repos specified in the -v (volume) flag of the docker run command
Use a web browser and navigate to http://localhost:8000, click the buttons to compile the documentation
The compiled documentations will be stored in <paddlepaddle working directory>/.ppo_workspace/content
If you don't wish to use Docker, you can also activate the tool through Django. Use the following commands to set it up:
.. code-block:: bash
mkdir paddlepaddle # Create paddlepaddle working directory
cd paddlepaddle
# Clone the content repositories and PaddlePaddle.org
git clone https://github.com/PaddlePaddle/Paddle.git
git clone https://github.com/PaddlePaddle/book.git
git clone https://github.com/PaddlePaddle/models.git
git clone https://github.com/PaddlePaddle/Mobile.git
git clone https://github.com/PaddlePaddle/PaddlePaddle.org.git
# Please specify the PaddlePaddle working directory. In the current setting, it should be pwd
export CONTENT_DIR=<path_to_paddlepaddle_working_directory>
export ENV=''
cd PaddlePaddle.org/portal/
pip install -r requirements.txt
python manage.py runserver
Use a web browser and navigate to http://localhost:8000, click the buttons to compile the documentation
The compiled documentations will be stored in <paddlepaddle working directory>/.ppo_workspace/content
If you want to learn more about the PaddlePaddle.org tool, please `click here <https://github.com/PaddlePaddle/PaddlePaddle.org/blob/develop/README.md>`_ .
How to write Documentations
============
PaddlePaddle uses `sphinx`_ to compile documentations. Please check the sphinx official website for more details.
How to update www.paddlepaddle.org
============================
Please create PRs and submit them to GitHub; for details, see `Contribute Code <http://www.paddlepaddle.org/docs/develop/documentation/en/howto/dev/contribute_to_paddle_en.html>`_ .
The documentation of the PaddlePaddle develop branch is updated automatically once a PR is merged. Users may check the latest `Chinese Docs <http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html>`_ and
`English Docs <http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html>`_ .
.. _cmake: https://cmake.org/
.. _sphinx: http://www.sphinx-doc.org/en/1.4.8/
...@@ -19,7 +19,7 @@ ...@@ -19,7 +19,7 @@
.. toctree:: .. toctree::
:maxdepth: 1 :maxdepth: 1
dev/build_cn.rst dev/contribute_to_paddle_cn.md
dev/write_docs_cn.rst dev/write_docs_cn.rst
模型配置 模型配置
......
...@@ -18,9 +18,9 @@ Development ...@@ -18,9 +18,9 @@ Development
.. toctree:: .. toctree::
:maxdepth: 1 :maxdepth: 1
dev/build_en.rst
dev/new_layer_en.rst dev/new_layer_en.rst
dev/contribute_to_paddle_en.md dev/contribute_to_paddle_en.md
dev/write_docs_en.rst
Configuration Configuration
------------- -------------
......
此教程会介绍如何使用Python的cProfile包,与Python库yep,google perftools来运行性能分析(Profiling)与调优。 This tutorial introduces techniques we use to profile and tune the
CPU performance of PaddlePaddle. We will use Python packages
`cProfile` and `yep`, and Google's `perftools`.
运行性能分析可以让开发人员科学的,有条不紊的对程序进行性能优化。性能分析是性能调优的基础。因为在程序实际运行中,真正的瓶颈可能和程序员开发过程中想象的瓶颈相去甚远。 Profiling is the process that reveals performance bottlenecks,
which could be very different from what's in the developers' mind.
Performance tuning is done to fix these bottlenecks. Performance optimization
repeats the steps of profiling and tuning alternately.
性能优化的步骤,通常是循环重复若干次『性能分析 --> 寻找瓶颈 ---> 调优瓶颈 --> 性能分析确认调优效果』。其中性能分析是性能调优的至关重要的量化指标。 PaddlePaddle users program AI applications by calling the Python API, which calls
into `libpaddle.so`, which is written in C++. In this tutorial, we focus on
the profiling and tuning of
Paddle提供了Python语言绑定。用户使用Python进行神经网络编程,训练,测试。Python解释器通过`pybind``swig`调用Paddle的动态链接库,进而调用Paddle C++部分的代码。所以Paddle的性能分析与调优分为两个部分: 1. the Python code and
1. the mixture of Python and C++ code.
* Python代码的性能分析 ## Profiling the Python Code
* Python与C++混合代码的性能分析
### Generate the Performance Profiling File
## Python代码的性能分析 We can use Python standard
package, [`cProfile`](https://docs.python.org/2/library/profile.html),
### 生成性能分析文件 to generate Python profiling file. For example:
Python标准库中提供了性能分析的工具包,[cProfile](https://docs.python.org/2/library/profile.html)。生成Python性能分析的命令如下:
```bash ```bash
python -m cProfile -o profile.out main.py python -m cProfile -o profile.out main.py
``` ```
其中`-o`标识了一个输出的文件名,用来存储本次性能分析的结果。如果不指定这个文件,`cProfile`会打印一些统计信息到`stdout`。这不方便我们进行后期处理(进行`sort`, `split`, `cut`等等)。 where `main.py` is the program we are going to profile, `-o` specifies
the output file. Without `-o`, `cProfile` would output to standard
### 查看性能分析文件 output.
当main.py运行完毕后,性能分析结果文件`profile.out`就生成出来了。我们可以使用[cprofilev](https://github.com/ymichael/cprofilev)来查看性能分析结果。`cprofilev`是一个Python的第三方库。使用它会开启一个HTTP服务,将性能分析结果以网页的形式展示出来。 ### Look into the Profiling File
使用`pip install cprofilev`安装`cprofilev`工具。安装完成后,使用如下命令开启HTTP服务 `cProfile` generates `profile.out` after `main.py` completes. We can
use [`cprofilev`](https://github.com/ymichael/cprofilev) to look into
the details:
```bash ```bash
cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py
``` ```
其中`-a`标识HTTP服务绑定的IP。使用`0.0.0.0`允许外网访问这个HTTP服务。`-p`标识HTTP服务的端口。`-f`标识性能分析的结果文件。`main.py`标识被性能分析的源文件。 where `-a` specifies the HTTP IP, `-p` specifies the port, `-f`
specifies the profiling file, and `main.py` is the source file.
访问对应网址,即可显示性能分析的结果。性能分析结果格式如下: Open the Web browser and point it to the local IP and the specified
port, and we will see output like the following:
```text ```
ncalls tottime percall cumtime percall filename:lineno(function) ncalls tottime percall cumtime percall filename:lineno(function)
1 0.284 0.284 29.514 29.514 main.py:1(<module>) 1 0.284 0.284 29.514 29.514 main.py:1(<module>)
4696 0.128 0.000 15.748 0.003 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/executor.py:20(run) 4696 0.128 0.000 15.748 0.003 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/executor.py:20(run)
...@@ -44,23 +54,23 @@ cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py ...@@ -44,23 +54,23 @@ cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py
1 0.144 0.144 6.534 6.534 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/__init__.py:14(<module>) 1 0.144 0.144 6.534 6.534 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/__init__.py:14(<module>)
``` ```
每一列的含义是: where each line corresponds to Python function, and the meaning of
each column is as follows:
| 列名 | 含义 | | column | meaning |
| --- | --- | | --- | --- |
| ncalls | 函数的调用次数 | | ncalls | the number of calls into a function |
| tottime | 函数实际使用的总时间。该时间去除掉本函数调用其他函数的时间 | | tottime | the total execution time of the function, not including the
| percall | tottime的每次调用平均时间 | execution time of other functions called by the function |
| cumtime | 函数总时间。包含这个函数调用其他函数的时间 | | percall | tottime divided by ncalls |
| percall | cumtime的每次调用平均时间 | | cumtime | the total execution time of the function, including the execution time of other functions being called |
| filename:lineno(function) | 文件名, 行号,函数名 | | percall | cumtime divided by ncalls |
| filename:lineno(function) | where the function is defined |
### Identify Performance Bottlenecks
### 寻找性能瓶颈 Usually, `tottime` and the related `percall` time is what we want to
focus on. We can sort above profiling file by tottime:
通常`tottime``cumtime`是寻找瓶颈的关键指标。这两个指标代表了某一个函数真实的运行时间。
将性能分析结果按照tottime排序,效果如下:
```text ```text
4696 12.040 0.003 12.040 0.003 {built-in method run} 4696 12.040 0.003 12.040 0.003 {built-in method run}
...@@ -68,12 +78,15 @@ cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py ...@@ -68,12 +78,15 @@ cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py
107991 0.676 0.000 1.519 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:219(__init__) 107991 0.676 0.000 1.519 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:219(__init__)
4697 0.626 0.000 2.291 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp) 4697 0.626 0.000 2.291 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp)
1 0.618 0.618 0.618 0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/__init__.py:1(<module>) 1 0.618 0.618 0.618 0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/__init__.py:1(<module>)
``` ```
可以看到最耗时的函数是C++端的`run`函数。这需要联合我们第二节`Python与C++混合代码的性能分析`来进行调优。而`sync_with_cpp`函数的总共耗时很长,每次调用的耗时也很长。于是我们可以点击`sync_with_cpp`的详细信息,了解其调用关系。 We can see that the most time-consuming function is the `built-in
method run`, which is a C++ function in `libpaddle.so`. We will
explain how to profile C++ code in the next section. At this
moment, let's look into the third function `sync_with_cpp`, which is a
Python function. We can click it to understand more about it:
```text ```
Called By: Called By:
Ordered by: internal time Ordered by: internal time
...@@ -92,72 +105,93 @@ Called: ...@@ -92,72 +105,93 @@ Called:
List reduced from 4497 to 2 due to restriction <'sync_with_cpp'> List reduced from 4497 to 2 due to restriction <'sync_with_cpp'>
``` ```
通常观察热点函数间的调用关系,和对应行的代码,就可以了解到问题代码在哪里。当我们做出性能修正后,再次进行性能分析(profiling)即可检查我们调优后的修正是否能够改善程序的性能。 The lists of the callers of `sync_with_cpp` might help us understand
how to improve the function definition.
## Profiling Python and C++ Code
### Generate the Profiling File
## Python与C++混合代码的性能分析 To profile a mixture of Python and C++ code, we can use a Python
package, `yep`, that can work with Google's `perftools`, which is a
commonly-used profiler for C/C++ code.
### 生成性能分析文件 In Ubuntu systems, we can install `yep` and `perftools` by running the
following commands:
C++的性能分析工具非常多。常见的包括`gprof`, `valgrind`, `google-perftools`。但是调试Python中使用的动态链接库与直接调试原始二进制相比增加了很多复杂度。幸而Python的一个第三方库`yep`提供了方便的和`google-perftools`交互的方法。于是这里使用`yep`进行Python与C++混合代码的性能分析
使用`yep`前需要安装`google-perftools``yep`包。ubuntu下安装命令为
```bash ```bash
apt update
apt install libgoogle-perftools-dev apt install libgoogle-perftools-dev
pip install yep pip install yep
``` ```
安装完毕后,我们可以通过 Then we can run the following command
```bash ```bash
python -m yep -v main.py python -m yep -v main.py
``` ```
生成性能分析文件。生成的性能分析文件为`main.py.prof` to generate the profiling file. The default filename is
`main.py.prof`.
Please be aware of the `-v` command line option, which prints the
analysis results after generating the profiling file. By examining the
printed result, we can tell whether debug
information was stripped from `libpaddle.so` at build time. The following hints
help make sure that the analysis results are readable:
命令行中的`-v`指定在生成性能分析文件之后,在命令行显示分析结果。我们可以在命令行中简单的看一下生成效果。因为C++与Python不同,编译时可能会去掉调试信息,运行时也可能因为多线程产生混乱不可读的性能分析结果。为了生成更可读的性能分析结果,可以采取下面几点措施: 1. Use GCC command line option `-g` when building `libpaddle.so` so to
include the debug information. The standard building system of
PaddlePaddle is CMake, so you might want to set
`CMAKE_BUILD_TYPE=RelWithDebInfo`.
1. 编译时指定`-g`生成调试信息。使用cmake的话,可以将CMAKE_BUILD_TYPE指定为`RelWithDebInfo` 1. Use GCC command line option `-O2` or `-O3` to generate optimized
2. 编译时一定要开启优化。单纯的`Debug`编译性能会和`-O2`或者`-O3`有非常大的差别。`Debug`模式下的性能测试是没有意义的。 binary code. It doesn't make sense to profile `libpaddle.so`
3. 运行性能分析的时候,先从单线程开始,再开启多线程,进而多机。毕竟如果单线程调试更容易。可以设置`OMP_NUM_THREADS=1`这个环境变量关闭openmp优化。 without optimization, because it would anyway run slowly.
### 查看性能分析文件 1. Profile the single-threaded binary file before the
multi-threaded version, because the latter often generates a tangled
profiling analysis result. You might want to set the environment
variable `OMP_NUM_THREADS=1` to prevent OpenMP from automatically
starting multiple threads.
在运行完性能分析后,会生成性能分析结果文件。我们可以使用[pprof](https://github.com/google/pprof)来显示性能分析结果。注意,这里使用了用`Go`语言重构后的`pprof`,因为这个工具具有web服务界面,且展示效果更好。 ### Examining the Profiling File
安装`pprof`的命令和一般的`Go`程序是一样的,其命令如下: The tool we used to examine the profiling file generated by
`perftools` is [`pprof`](https://github.com/google/pprof), which
provides a Web-based GUI like `cprofilev`.
We can rely on the standard Go toolchain to retrieve the source code
of `pprof` and build it:
```bash ```bash
go get github.com/google/pprof go get github.com/google/pprof
``` ```
进而我们可以使用如下命令开启一个HTTP服务: Then we can use it to profile `main.py.prof` generated in the previous
section:
```bash ```bash
pprof -http=0.0.0.0:3213 `which python` ./main.py.prof pprof -http=0.0.0.0:3213 `which python` ./main.py.prof
``` ```
这行命令中,`-http`指开启HTTP服务。`which python`会产生当前Python二进制的完整路径,进而指定了Python可执行文件的路径。`./main.py.prof`输入了性能分析结果。 Where `-http` specifies the IP and port of the HTTP service.
Directing our Web browser to the service, we would see something like
访问对应的网址,我们可以查看性能分析的结果。结果如下图所示: the following:
![result](./pprof_1.png) ![result](./pprof_1.png)
### Identifying the Performance Bottlenecks
### 寻找性能瓶颈 Similar to how we work with `cprofilev`, we'd focus on `tottime` and
`cumtime`.
与寻找Python代码的性能瓶颈类似,寻找Python与C++混合代码的性能瓶颈也是要看`tottime``cumtime`。而`pprof`展示的调用图也可以帮助我们发现性能中的问题。
例如下图中,
![kernel_perf](./pprof_2.png) ![kernel_perf](./pprof_2.png)
在一次训练中,乘法和乘法梯度的计算占用2%-4%左右的计算时间。而`MomentumOp`占用了17%左右的计算时间。显然,`MomentumOp`的性能有问题。 We can see that the execution time of multiplication and the computing
of the gradient of multiplication takes 2% to 4% of the total running
`pprof`中,对于性能的关键路径都做出了红色标记。先检查关键路径的性能问题,再检查其他部分的性能问题,可以更有次序的完成性能的优化。 time, and `MomentumOp` takes about 17%. Obviously, we'd want to
optimize `MomentumOp`.
## 总结
至此,两种性能分析的方式都介绍完毕了。希望通过这两种性能分析的方式,Paddle的开发人员和使用人员可以有次序的,科学的发现和解决性能问题。 `pprof` would mark performance critical parts of the program in
red. It's a good idea to follow the hints.
此教程会介绍如何使用Python的cProfile包、Python库yep、Google perftools来进行性能分析 (profiling) 与调优(performance tuning)。
Profiling 指发现性能瓶颈。系统中的瓶颈可能和程序员开发过程中想象的瓶颈相去甚远。Tuning 指消除瓶颈。性能优化的过程通常是不断重复地 profiling 和 tuning。
PaddlePaddle 用户一般通过调用 Python API 编写深度学习程序。大部分 Python API 调用用 C++ 写的 libpaddle.so。所以 PaddlePaddle 的性能分析与调优分为两个部分:
* Python 代码的性能分析
* Python 与 C++ 混合代码的性能分析
## Python代码的性能分析
### 生成性能分析文件
Python标准库中提供了性能分析的工具包,[cProfile](https://docs.python.org/2/library/profile.html)。生成Python性能分析的命令如下:
```bash
python -m cProfile -o profile.out main.py
```
其中 `main.py` 是我们要分析的程序,`-o`标识了一个输出的文件名,用来存储本次性能分析的结果。如果不指定这个文件,`cProfile`会打印到标准输出。
### 查看性能分析文件
`cProfile` 在main.py 运行完毕后输出`profile.out`。我们可以使用[`cprofilev`](https://github.com/ymichael/cprofilev)来查看性能分析结果。`cprofilev`是一个Python的第三方库。使用它会开启一个HTTP服务,将性能分析结果以网页的形式展示出来:
```bash
cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py
```
其中`-a`标识HTTP服务绑定的IP。使用`0.0.0.0`允许外网访问这个HTTP服务。`-p`标识HTTP服务的端口。`-f`标识性能分析的结果文件。`main.py`标识被性能分析的源文件。
用Web浏览器访问对应网址,即可显示性能分析的结果:
```
ncalls tottime percall cumtime percall filename:lineno(function)
1 0.284 0.284 29.514 29.514 main.py:1(<module>)
4696 0.128 0.000 15.748 0.003 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/executor.py:20(run)
4696 12.040 0.003 12.040 0.003 {built-in method run}
1 0.144 0.144 6.534 6.534 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/__init__.py:14(<module>)
```
每一列的含义是:
| 列名 | 含义 |
| --- | --- |
| ncalls | 函数的调用次数 |
| tottime | 函数实际使用的总时间。该时间去除掉本函数调用其他函数的时间 |
| percall | tottime的每次调用平均时间 |
| cumtime | 函数总时间。包含这个函数调用其他函数的时间 |
| percall | cumtime的每次调用平均时间 |
| filename:lineno(function) | 文件名, 行号,函数名 |
### 寻找性能瓶颈
通常`tottime`和`cumtime`是寻找瓶颈的关键指标。这两个指标代表了某一个函数真实的运行时间。
将性能分析结果按照tottime排序,效果如下:
```text
4696 12.040 0.003 12.040 0.003 {built-in method run}
300005 0.874 0.000 1.681 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/dataset/mnist.py:38(reader)
107991 0.676 0.000 1.519 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:219(__init__)
4697 0.626 0.000 2.291 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp)
1 0.618 0.618 0.618 0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/__init__.py:1(<module>)
```
可以看到最耗时的函数是C++端的`run`函数。这需要联合我们第二节`Python`与`C++`混合代码的性能分析来进行调优。而`sync_with_cpp`函数的总共耗时很长,每次调用的耗时也很长。于是我们可以点击`sync_with_cpp`的详细信息,了解其调用关系。
```text
Called By:
Ordered by: internal time
List reduced from 4497 to 2 due to restriction <'sync_with_cpp'>
Function was called by...
ncalls tottime cumtime
/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp) <- 4697 0.626 2.291 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp)
/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp) <- 4696 0.019 2.316 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:487(clone)
1 0.000 0.001 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:534(append_backward)
Called:
Ordered by: internal time
List reduced from 4497 to 2 due to restriction <'sync_with_cpp'>
```
通常观察热点函数间的调用关系,和对应行的代码,就可以了解到问题代码在哪里。当我们做出性能修正后,再次进行性能分析(profiling)即可检查我们调优后的修正是否能够改善程序的性能。
## Python与C++混合代码的性能分析
### 生成性能分析文件
C++的性能分析工具非常多。常见的包括`gprof`, `valgrind`, `google-perftools`。但是调试Python中使用的动态链接库与直接调试原始二进制相比增加了很多复杂度。幸而Python的一个第三方库`yep`提供了方便的和`google-perftools`交互的方法。于是这里使用`yep`进行Python与C++混合代码的性能分析
使用`yep`前需要安装`google-perftools`和`yep`包。ubuntu下安装命令为
```bash
apt update
apt install libgoogle-perftools-dev
pip install yep
```
安装完毕后,我们可以通过
```bash
python -m yep -v main.py
```
生成性能分析文件。生成的性能分析文件为`main.py.prof`
命令行中的`-v`指定在生成性能分析文件之后,在命令行显示分析结果。我们可以在命令行中简单的看一下生成效果。因为C++与Python不同,编译时可能会去掉调试信息,运行时也可能因为多线程产生混乱不可读的性能分析结果。为了生成更可读的性能分析结果,可以采取下面几点措施:
1. 编译时指定`-g`生成调试信息。使用cmake的话,可以将CMAKE_BUILD_TYPE指定为`RelWithDebInfo`
2. 编译时一定要开启优化。单纯的`Debug`编译性能会和`-O2`或者`-O3`有非常大的差别。`Debug`模式下的性能测试是没有意义的。
3. 运行性能分析的时候,先从单线程开始,再开启多线程,进而多机。毕竟单线程调试更容易。可以设置`OMP_NUM_THREADS=1`这个环境变量关闭openmp优化。
### 查看性能分析文件
在运行完性能分析后,会生成性能分析结果文件。我们可以使用[`pprof`](https://github.com/google/pprof)来显示性能分析结果。注意,这里使用了用`Go`语言重构后的`pprof`,因为这个工具具有web服务界面,且展示效果更好。
安装`pprof`的命令和一般的`Go`程序是一样的,其命令如下:
```bash
go get github.com/google/pprof
```
进而我们可以使用如下命令开启一个HTTP服务:
```bash
pprof -http=0.0.0.0:3213 `which python` ./main.py.prof
```
这行命令中,`-http`指开启HTTP服务。`which python`会产生当前Python二进制的完整路径,进而指定了Python可执行文件的路径。`./main.py.prof`输入了性能分析结果。
访问对应的网址,我们可以查看性能分析的结果。结果如下图所示:
![result](./pprof_1.png)
### 寻找性能瓶颈
与寻找Python代码的性能瓶颈类似,寻找Python与C++混合代码的性能瓶颈也是要看`tottime`和`cumtime`。而`pprof`展示的调用图也可以帮助我们发现性能中的问题。
例如下图中,
![kernel_perf](./pprof_2.png)
在一次训练中,乘法和乘法梯度的计算占用2%-4%左右的计算时间。而`MomentumOp`占用了17%左右的计算时间。显然,`MomentumOp`的性能有问题。
`pprof`中,对于性能的关键路径都做出了红色标记。先检查关键路径的性能问题,再检查其他部分的性能问题,可以更有次序的完成性能的优化。
...@@ -4,6 +4,16 @@ else () ...@@ -4,6 +4,16 @@ else ()
set(PADDLE_FLOAT_TYPE float) set(PADDLE_FLOAT_TYPE float)
endif() endif()
# Record the hash of the most recent commit of the source tree in
# PADDLE_GIT_COMMIT. If the hash cannot be obtained, PADDLE_GIT_COMMIT is set
# to the placeholder "no commit information" instead.
#
# Start from a known "failed" state so that skipping the git call below is
# handled the same way as a failing git call.
set(PADDLE_GIT_COMMIT "")
set(PADDLE_GIT_COMMIT_RESULT 1)
# Only invoke git when its path is known; an empty ${GIT_EXECUTABLE} would
# otherwise turn the command line into a program literally named "log".
if(GIT_EXECUTABLE)
  execute_process(
    COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1
    WORKING_DIRECTORY "${PADDLE_SOURCE_DIR}"
    OUTPUT_VARIABLE PADDLE_GIT_COMMIT
    RESULT_VARIABLE PADDLE_GIT_COMMIT_RESULT
    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
endif()
# Fall back to the placeholder when git exited with a non-zero status (or
# could not be started, in which case the result is an error string) or
# produced no output.
if(NOT PADDLE_GIT_COMMIT_RESULT EQUAL 0 OR NOT PADDLE_GIT_COMMIT)
  set(PADDLE_GIT_COMMIT "no commit information")
endif()
# config.h used for C-API. It will store Paddle building configuration as a # config.h used for C-API. It will store Paddle building configuration as a
# header. Make user just include PaddleCAPI.h then can get building # header. Make user just include PaddleCAPI.h then can get building
# configuration without explicitly set -DPADDLE_WITH_DOUBLE when building their # configuration without explicitly set -DPADDLE_WITH_DOUBLE when building their
......
...@@ -3,6 +3,9 @@ ...@@ -3,6 +3,9 @@
typedef @PADDLE_FLOAT_TYPE@ paddle_real; typedef @PADDLE_FLOAT_TYPE@ paddle_real;
#define __PADDLE_VERSION__ "@PADDLE_VERSION@"
#define __PADDLE_COMMIT__ "@PADDLE_GIT_COMMIT@"
// Since we only support linux and macos in compile, always use clang or // Since we only support linux and macos in compile, always use clang or
// gcc 4.8+. DLL_IMPORT/DLL_EXPORT is as simple as below. // gcc 4.8+. DLL_IMPORT/DLL_EXPORT is as simple as below.
#define PD_API __attribute__((visibility("default"))) #define PD_API __attribute__((visibility("default")))
......
...@@ -27,6 +27,18 @@ ...@@ -27,6 +27,18 @@
namespace paddle { namespace paddle {
namespace framework { namespace framework {
// The backward pass of control flow operators is significantly different
// from that of computational operators, so they are special-cased by type
// name. This is a hack; a better way to differentiate control flow operators
// should be designed.
//
// Returns the set of operator types that are treated as control flow
// operators ("increment" and "lod_rank_table"). The set is held in a
// function-local static, so its one-time construction is thread-safe under
// C++11. It is heap-allocated and intentionally never freed, exactly like the
// previous lazily-created global.
static std::unordered_set<std::string>& CtrlFlowOps() {
  static auto* ctrl_flow_ops =
      new std::unordered_set<std::string>{"increment", "lod_rank_table"};
  return *ctrl_flow_ops;
}
static inline std::unique_ptr<OperatorBase> CreateGradOp( static inline std::unique_ptr<OperatorBase> CreateGradOp(
const OperatorBase& op, const std::unordered_set<std::string>& no_grad_set, const OperatorBase& op, const std::unordered_set<std::string>& no_grad_set,
std::unordered_map<std::string, std::string>* grad_to_var) { std::unordered_map<std::string, std::string>* grad_to_var) {
...@@ -288,12 +300,24 @@ static void CreateGradVarInBlock( ...@@ -288,12 +300,24 @@ static void CreateGradVarInBlock(
for (size_t op_index = grad_op_start_index; op_index < ops.size(); for (size_t op_index = grad_op_start_index; op_index < ops.size();
++op_index) { ++op_index) {
std::unordered_set<std::string> new_vars; std::unordered_set<std::string> new_vars;
auto& ctrl_flow_ops = CtrlFlowOps();
ForEachVarName(ops[op_index]->Outputs(), ForEachVarName(ops[op_index]->Outputs(),
[&](const std::string& grad_var_name) { [&](const std::string& grad_var_name) {
if (ctrl_flow_ops.find(ops[op_index]->Type()) !=
ctrl_flow_ops.end()) {
if (block_desc->HasVarRecursive(grad_var_name)) {
return false;
}
} else {
if (block_desc->HasVar(grad_var_name)) { if (block_desc->HasVar(grad_var_name)) {
return false; return false;
} }
}
if (grad_var_name == framework::kEmptyVarName) {
return false;
}
auto var = block_desc->Var(grad_var_name); auto var = block_desc->Var(grad_var_name);
VLOG(10) << "Creating Variable " << grad_var_name;
new_vars.insert(var->Name()); new_vars.insert(var->Name());
auto it = param_name_map.find(grad_var_name); auto it = param_name_map.find(grad_var_name);
if (it == param_name_map.end()) { if (it == param_name_map.end()) {
...@@ -333,14 +357,25 @@ std::vector<std::unique_ptr<OpDescBind>> MakeOpGrad( ...@@ -333,14 +357,25 @@ std::vector<std::unique_ptr<OpDescBind>> MakeOpGrad(
// All input gradients of forwarding operator do not need to calculate. // All input gradients of forwarding operator do not need to calculate.
const std::vector<std::string>& inputs = op_desc->InputArgumentNames(); const std::vector<std::string>& inputs = op_desc->InputArgumentNames();
if (AllGradInSet(inputs, *no_grad_vars)) { if (AllGradInSet(inputs, *no_grad_vars)) {
VLOG(10) << "Drop operator " << op_desc->Type();
return grad_op_descs; // empty vector return grad_op_descs; // empty vector
} }
// All output gradients of forwarding operator do not need to calculate. // All output gradients of forwarding operator do not need to calculate.
const std::vector<std::string>& outputs = op_desc->OutputArgumentNames(); const std::vector<std::string>& outputs = op_desc->OutputArgumentNames();
if (AllGradInSet(outputs, *no_grad_vars)) { if (AllGradInSet(outputs, *no_grad_vars)) {
VLOG(10) << "Drop operator " << op_desc->Type();
// FIXME: Hack code here
auto& ctrl_flow_ops = CtrlFlowOps();
if (ctrl_flow_ops.find(op_desc->Type()) == ctrl_flow_ops.end()) {
// Only computational op need drop input's gradient.
for (const std::string& name : inputs) { for (const std::string& name : inputs) {
no_grad_vars->insert(GradVarName(name)); no_grad_vars->insert(GradVarName(name));
VLOG(10) << " Also drop " << GradVarName(name);
}
} }
return grad_op_descs; // empty vector return grad_op_descs; // empty vector
} }
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/framework/block_desc.h" #include "paddle/framework/block_desc.h"
#include "paddle/framework/operator.h"
#include "paddle/framework/program_desc.h" #include "paddle/framework/program_desc.h"
namespace paddle { namespace paddle {
...@@ -42,6 +43,8 @@ bool BlockDescBind::HasVar(const std::string &name) const { ...@@ -42,6 +43,8 @@ bool BlockDescBind::HasVar(const std::string &name) const {
} }
VarDescBind *BlockDescBind::FindVarRecursive(const std::string &name) const { VarDescBind *BlockDescBind::FindVarRecursive(const std::string &name) const {
if (name == kEmptyVarName) return nullptr;
auto it = vars_.find(name); auto it = vars_.find(name);
if (it == vars_.end()) { if (it == vars_.end()) {
return Parent() == kNoneBlockIndex ? nullptr return Parent() == kNoneBlockIndex ? nullptr
......
...@@ -97,6 +97,10 @@ void Executor::Run(const ProgramDescBind& pdesc, Scope* scope, int block_id, ...@@ -97,6 +97,10 @@ void Executor::Run(const ProgramDescBind& pdesc, Scope* scope, int block_id,
if (create_local_scope) { if (create_local_scope) {
local_scope = &scope->NewScope(); local_scope = &scope->NewScope();
for (auto& var : block.AllVars()) { for (auto& var : block.AllVars()) {
if (var->Name() == framework::kEmptyVarName) {
continue;
}
if (var->Persistable()) { if (var->Persistable()) {
auto* ptr = scope->Var(var->Name()); auto* ptr = scope->Var(var->Name());
CreateTensor(ptr, var->GetType()); CreateTensor(ptr, var->GetType());
......
...@@ -65,7 +65,7 @@ class CompileTimeInferShapeContext : public InferShapeContext { ...@@ -65,7 +65,7 @@ class CompileTimeInferShapeContext : public InferShapeContext {
PADDLE_ENFORCE_EQ(in_var->GetType(), VarDesc::LOD_TENSOR, PADDLE_ENFORCE_EQ(in_var->GetType(), VarDesc::LOD_TENSOR,
"The %d-th output of Output(%s) must be LoDTensor.", j, "The %d-th output of Output(%s) must be LoDTensor.", j,
out); out);
in_var->SetLoDLevel(out_var->GetLodLevel()); out_var->SetLoDLevel(in_var->GetLodLevel());
} }
bool IsRuntime() const override; bool IsRuntime() const override;
...@@ -466,7 +466,12 @@ DDim CompileTimeInferShapeContext::GetDim(const std::string &name) const { ...@@ -466,7 +466,12 @@ DDim CompileTimeInferShapeContext::GetDim(const std::string &name) const {
auto var = block_.FindVarRecursive(name); auto var = block_.FindVarRecursive(name);
PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name); PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
try { try {
auto shape = var->Shape();
if (shape.empty()) {
return framework::make_ddim({0UL});
} else {
return framework::make_ddim(var->Shape()); return framework::make_ddim(var->Shape());
}
} catch (...) { } catch (...) {
VLOG(5) << "GetDim of variable " << name << " error"; VLOG(5) << "GetDim of variable " << name << " error";
std::rethrow_exception(std::current_exception()); std::rethrow_exception(std::current_exception());
......
...@@ -36,12 +36,9 @@ Scope& Scope::NewScope() const { ...@@ -36,12 +36,9 @@ Scope& Scope::NewScope() const {
} }
Variable* Scope::Var(const std::string& name) { Variable* Scope::Var(const std::string& name) {
auto iter = vars_.find(name); auto* v = FindVarLocally(name);
if (iter != vars_.end()) { if (v != nullptr) return v;
VLOG(3) << "Get existing variable " << name; v = new Variable();
return iter->second;
}
Variable* v = new Variable();
vars_[name] = v; vars_[name] = v;
VLOG(3) << "Create variable " << name; VLOG(3) << "Create variable " << name;
v->name_ = &(vars_.find(name)->first); v->name_ = &(vars_.find(name)->first);
...@@ -57,8 +54,10 @@ Variable* Scope::Var(std::string* name) { ...@@ -57,8 +54,10 @@ Variable* Scope::Var(std::string* name) {
} }
Variable* Scope::FindVar(const std::string& name) const { Variable* Scope::FindVar(const std::string& name) const {
auto it = vars_.find(name); auto var = FindVarLocally(name);
if (it != vars_.end()) return it->second; if (var != nullptr) {
return var;
}
return (parent_ == nullptr) ? nullptr : parent_->FindVar(name); return (parent_ == nullptr) ? nullptr : parent_->FindVar(name);
} }
...@@ -116,6 +115,11 @@ std::string Scope::Rename(const std::string& origin_name) const { ...@@ -116,6 +115,11 @@ std::string Scope::Rename(const std::string& origin_name) const {
Rename(origin_name, var_name); Rename(origin_name, var_name);
return var_name; return var_name;
} }
// Looks up `name` in this scope's own variable map only; parent scopes are
// not consulted. Returns the stored pointer, or nullptr when the name is
// absent from vars_.
Variable* Scope::FindVarLocally(const std::string& name) const {
  auto found = vars_.find(name);
  return found == vars_.end() ? nullptr : found->second;
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -76,6 +76,8 @@ class Scope { ...@@ -76,6 +76,8 @@ class Scope {
std::string Rename(const std::string& origin_name) const; std::string Rename(const std::string& origin_name) const;
private: private:
Variable* FindVarLocally(const std::string& name) const;
// Call Scope::NewScope for a sub-scope. // Call Scope::NewScope for a sub-scope.
explicit Scope(Scope const* parent) : parent_(parent) {} explicit Scope(Scope const* parent) : parent_(parent) {}
......
...@@ -12,6 +12,8 @@ ...@@ -12,6 +12,8 @@
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/framework/shape_inference.h" #include "paddle/framework/shape_inference.h"
#include "grad_op_desc_maker.h"
#include "paddle/framework/operator.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -22,6 +24,12 @@ std::vector<framework::DDim> InferShapeContext::GetInputsDim( ...@@ -22,6 +24,12 @@ std::vector<framework::DDim> InferShapeContext::GetInputsDim(
return GetDims(names); return GetDims(names);
} }
// Returns the dimension of the idx-th variable bound to the input slot
// `name`.
//
// `idx` must satisfy 0 <= idx < Inputs(name).size(); otherwise the enforce
// check below fails instead of indexing the name list out of bounds.
DDim InferShapeContext::GetInputsElementDim(const std::string &name,
                                            int idx) const {
  const std::vector<std::string> &names = Inputs(name);
  PADDLE_ENFORCE(idx >= 0 && static_cast<size_t>(idx) < names.size(),
                 "Input(%s) has %d element(s), but element %d was requested.",
                 name, names.size(), idx);
  return this->GetDim(names[idx]);
}
void InferShapeContext::SetOutputsDim( void InferShapeContext::SetOutputsDim(
const std::string &name, const std::vector<framework::DDim> &dims) { const std::string &name, const std::vector<framework::DDim> &dims) {
auto &names = Outputs(name); auto &names = Outputs(name);
...@@ -43,6 +51,9 @@ void InferShapeContext::SetDims(const std::vector<std::string> &names, ...@@ -43,6 +51,9 @@ void InferShapeContext::SetDims(const std::vector<std::string> &names,
size_t length = names.size(); size_t length = names.size();
PADDLE_ENFORCE_EQ(length, dims.size()); PADDLE_ENFORCE_EQ(length, dims.size());
for (size_t i = 0; i < length; ++i) { for (size_t i = 0; i < length; ++i) {
if (names[i] == framework::kEmptyVarName) {
continue;
}
SetDim(names[i], dims[i]); SetDim(names[i], dims[i]);
} }
} }
......
...@@ -37,6 +37,7 @@ class InferShapeContext { ...@@ -37,6 +37,7 @@ class InferShapeContext {
virtual framework::DDim GetInputDim(const std::string &name) const = 0; virtual framework::DDim GetInputDim(const std::string &name) const = 0;
std::vector<framework::DDim> GetInputsDim(const std::string &name) const; std::vector<framework::DDim> GetInputsDim(const std::string &name) const;
DDim GetInputsElementDim(const std::string &name, int idx) const;
virtual void SetOutputDim(const std::string &name, const DDim &dim) = 0; virtual void SetOutputDim(const std::string &name, const DDim &dim) = 0;
void SetOutputsDim(const std::string &name, void SetOutputsDim(const std::string &name,
......
...@@ -21,7 +21,7 @@ template <class T> ...@@ -21,7 +21,7 @@ template <class T>
struct EigenBlasGemm { struct EigenBlasGemm {
typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, int>, typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, int>,
Eigen::Aligned> Eigen::Aligned>
Matrix; EigenMatrix;
static void compute(const bool transA, static void compute(const bool transA,
const bool transB, const bool transB,
...@@ -56,14 +56,13 @@ struct EigenBlasGemm { ...@@ -56,14 +56,13 @@ struct EigenBlasGemm {
sizeB[1] = N; sizeB[1] = N;
CHECK_EQ(N, ldb); CHECK_EQ(N, ldb);
} }
Eigen::array<int, 2> sizeC; Eigen::array<int, 2> sizeC = {{M, ldc}};
sizeC[0] = M; Eigen::array<int, 2> offsetC = {{0, 0}};
sizeC[1] = N; Eigen::array<int, 2> extentC = {{M, N}};
CHECK_EQ(N, ldc);
const Matrix a(const_cast<T*>(A), sizeA); const EigenMatrix a(const_cast<T*>(A), sizeA);
const Matrix b(const_cast<T*>(B), sizeB); const EigenMatrix b(const_cast<T*>(B), sizeB);
Matrix c(C, sizeC); EigenMatrix c(C, sizeC);
typedef typename Eigen::Tensor<T, 2>::DimensionPair DimPair; typedef typename Eigen::Tensor<T, 2>::DimensionPair DimPair;
Eigen::array<DimPair, 1> dims; Eigen::array<DimPair, 1> dims;
...@@ -72,6 +71,7 @@ struct EigenBlasGemm { ...@@ -72,6 +71,7 @@ struct EigenBlasGemm {
dims[0].second = transB ? 1 : 0; dims[0].second = transB ? 1 : 0;
Eigen::DefaultDevice device; Eigen::DefaultDevice device;
if (N == ldc) {
if (alpha == T(1) && beta == T(0)) { if (alpha == T(1) && beta == T(0)) {
c.device(device) = a.contract(b, dims); c.device(device) = a.contract(b, dims);
} else if (alpha == T(1) && beta == T(1)) { } else if (alpha == T(1) && beta == T(1)) {
...@@ -79,6 +79,16 @@ struct EigenBlasGemm { ...@@ -79,6 +79,16 @@ struct EigenBlasGemm {
} else { } else {
c.device(device) = alpha * a.contract(b, dims) + beta * c; c.device(device) = alpha * a.contract(b, dims) + beta * c;
} }
} else {
if (alpha == T(1) && beta == T(0)) {
c.slice(offsetC, extentC).device(device) = a.contract(b, dims);
} else if (alpha == T(1) && beta == T(1)) {
c.slice(offsetC, extentC).device(device) += a.contract(b, dims);
} else {
c.slice(offsetC, extentC).device(device) =
alpha * a.contract(b, dims) + beta * c.slice(offsetC, extentC);
}
}
} }
}; };
......
...@@ -64,49 +64,111 @@ void HierarchicalSigmoidLayer::forward(PassType passType) { ...@@ -64,49 +64,111 @@ void HierarchicalSigmoidLayer::forward(PassType passType) {
batchSize, batchSize,
codeLength_, codeLength_,
/* trans */ false, /* trans */ false,
useGpu(deviceId_)); false);
Matrix::resizeOrCreate(preOutput_.grad, Matrix::resizeOrCreate(preOutput_.grad,
batchSize, batchSize,
codeLength_, codeLength_,
/* trans */ false, /* trans */ false,
useGpu(deviceId_)); false);
IVectorPtr label = getInput(*getLabelLayer()).ids; IVectorPtr label = getInput(*getLabelLayer()).ids;
preOutput_.value->zeroMem(); preOutput_.value->zeroMem();
if (useGpu_) {
Matrix::resizeOrCreate(cpuOutput_,
output_.value->getHeight(),
output_.value->getWidth(),
/* trans */ false,
false);
IVector::resizeOrCreate(cpuLabel_, label->getSize(), false);
cpuLabel_->copyFrom(*label);
cpuOutput_->copyFrom(*output_.value);
} else {
cpuOutput_ = output_.value;
cpuLabel_ = label;
}
/* add the bias-vector */ /* add the bias-vector */
if (biases_.get() != NULL) { if (biases_.get() != NULL) {
preOutput_.value->addByBitCode(numClasses_, *label, *biases_->getW()); if (useGpu_) {
Matrix::resizeOrCreate(cpuBias_,
1,
numClasses_ - 1,
/* trans */ false,
false);
cpuBias_->copyFrom(*biases_->getW());
} else {
cpuBias_ = biases_->getW();
}
preOutput_.value->addByBitCode(numClasses_, *cpuLabel_, *cpuBias_);
} }
for (size_t i = 0; i < inputLayers_.size() - 1; ++i) { for (size_t i = 0; i < inputLayers_.size() - 1; ++i) {
MatrixPtr input = getInputValue(i); MatrixPtr input = getInputValue(i);
if (useGpu_) {
Matrix::resizeOrCreate(cpuInput_,
input->getHeight(),
input->getWidth(),
/* trans */ false,
false);
Matrix::resizeOrCreate(cpuWeight_,
weights_[i]->getW()->getHeight(),
weights_[i]->getW()->getWidth(),
/* trans */ false,
false);
cpuInput_->copyFrom(*input);
cpuWeight_->copyFrom(*weights_[i]->getW());
} else {
cpuInput_ = input;
cpuWeight_ = weights_[i]->getW();
}
preOutput_.value->mulByBitCode( preOutput_.value->mulByBitCode(
numClasses_, *label, *weights_[i]->getW(), *input); numClasses_, *cpuLabel_, *cpuWeight_, *cpuInput_);
} }
// keep consistent with the clipping in the following softrelu // keep consistent with the clipping in the following softrelu
preOutput_.value->clip(-40.0, 40.0); preOutput_.value->clip(-40.0, 40.0);
preOutput_.value->sumByBitCode(numClasses_, preOutput_.value->sumByBitCode(numClasses_,
*label, *cpuLabel_,
*output_.value, *cpuOutput_,
-1); // scaleSum -1); // scaleSum
preOutput_.value->softrelu(*preOutput_.value); preOutput_.value->softrelu(*preOutput_.value);
MatrixPtr sum = MatrixPtr sum = Matrix::create(batchSize, 1, /* trans= */ false, false);
Matrix::create(batchSize, 1, /* trans= */ false, useGpu(deviceId_));
preOutput_.value->rowSum(*sum); preOutput_.value->rowSum(*sum);
output_.value->add(*sum); cpuOutput_->add(*sum);
if (useGpu_) {
output_.value->copyFrom(*cpuOutput_);
} else {
output_.value = cpuOutput_;
}
} }
void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) { void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) {
IVectorPtr label = getInput(*getLabelLayer()).ids; IVectorPtr label = getInput(*getLabelLayer()).ids;
if (useGpu_) {
IVector::resizeOrCreate(cpuLabel_, label->getSize(), false);
cpuLabel_->copyFrom(*label);
} else {
cpuLabel_ = label;
}
preOutput_.grad->one(); preOutput_.grad->one();
preOutput_.grad->softreluDerivative(*preOutput_.value); preOutput_.grad->softreluDerivative(*preOutput_.value);
preOutput_.grad->subByBitCode(numClasses_, *label); preOutput_.grad->subByBitCode(numClasses_, *cpuLabel_);
if (biases_ && biases_->getWGrad()) { if (biases_ && biases_->getWGrad()) {
preOutput_.grad->addByBitCodeBackward( MatrixPtr biases_grad = biases_->getWGrad();
numClasses_, *label, *biases_->getWGrad()); if (useGpu_) {
Matrix::resizeOrCreate(cpuBias_,
1,
numClasses_ - 1,
/* trans */ false,
false);
cpuBias_->copyFrom(*biases_grad);
} else {
cpuBias_ = biases_grad;
}
preOutput_.grad->addByBitCodeBackward(numClasses_, *cpuLabel_, *cpuBias_);
if (useGpu_) {
biases_grad->copyFrom(*cpuBias_);
} else {
biases_grad = cpuBias_;
}
/* Increasing the number of gradient */ /* Increasing the number of gradient */
biases_->getParameterPtr()->incUpdate(callback); biases_->getParameterPtr()->incUpdate(callback);
} }
...@@ -115,9 +177,31 @@ void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) { ...@@ -115,9 +177,31 @@ void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) {
/* Calculate the W-gradient for the current layer */ /* Calculate the W-gradient for the current layer */
MatrixPtr input = getInputValue(i); MatrixPtr input = getInputValue(i);
if (weights_[i]->getWGrad()) { if (weights_[i]->getWGrad()) {
MatrixPtr weights_grad = weights_[i]->getWGrad();
if (useGpu_) {
Matrix::resizeOrCreate(cpuInput_,
input->getHeight(),
input->getWidth(),
/* trans */ false,
false);
Matrix::resizeOrCreate(cpuWeightGrad_,
weights_grad->getHeight(),
weights_grad->getWidth(),
/* trans */ false,
false);
cpuInput_->copyFrom(*input);
cpuWeightGrad_->copyFrom(*weights_grad);
} else {
cpuInput_ = input;
cpuWeightGrad_ = weights_grad;
}
preOutput_.grad->mulByBitCodeBackwardWeight( preOutput_.grad->mulByBitCodeBackwardWeight(
numClasses_, *label, *weights_[i]->getWGrad(), *input); numClasses_, *cpuLabel_, *cpuWeightGrad_, *cpuInput_);
if (useGpu_) {
weights_grad->copyFrom(*cpuWeightGrad_);
} else {
weights_grad = cpuWeightGrad_;
}
/* Increasing the number of gradient */ /* Increasing the number of gradient */
weights_[i]->getParameterPtr()->incUpdate(callback); weights_[i]->getParameterPtr()->incUpdate(callback);
} }
...@@ -125,8 +209,30 @@ void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) { ...@@ -125,8 +209,30 @@ void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) {
/* Calculate the input layers error */ /* Calculate the input layers error */
MatrixPtr inputGrad = getInputGrad(i); MatrixPtr inputGrad = getInputGrad(i);
if (inputGrad) { if (inputGrad) {
if (useGpu_) {
Matrix::resizeOrCreate(cpuInputGrad_,
inputGrad->getHeight(),
inputGrad->getWidth(),
/* trans */ false,
false);
Matrix::resizeOrCreate(cpuWeight_,
weights_[i]->getW()->getHeight(),
weights_[i]->getW()->getWidth(),
/* trans */ false,
false);
cpuInputGrad_->copyFrom(*inputGrad);
cpuWeight_->copyFrom(*weights_[i]->getW());
} else {
cpuInputGrad_ = inputGrad;
cpuWeight_ = weights_[i]->getW();
}
preOutput_.grad->mulByBitCodeBackwardError( preOutput_.grad->mulByBitCodeBackwardError(
numClasses_, *label, *weights_[i]->getW(), *inputGrad); numClasses_, *cpuLabel_, *cpuWeight_, *cpuInputGrad_);
if (useGpu_) {
inputGrad->copyFrom(*cpuInputGrad_);
} else {
inputGrad = cpuInputGrad_;
}
} }
} }
} }
......
...@@ -80,6 +80,15 @@ protected: ...@@ -80,6 +80,15 @@ protected:
int codeLength_; int codeLength_;
/// temporary result of output_ /// temporary result of output_
Argument preOutput_; Argument preOutput_;
/// The temporary variables in CPU memory.
MatrixPtr cpuWeight_;
MatrixPtr cpuWeightGrad_;
MatrixPtr cpuInput_;
MatrixPtr cpuInputGrad_;
MatrixPtr cpuBias_;
MatrixPtr cpuOutput_;
IVectorPtr cpuLabel_;
}; };
} // namespace paddle } // namespace paddle
# gserver pacakge unittests # gserver pacakge unittests
add_simple_unittest(test_LinearChainCRF) add_simple_unittest(test_LinearChainCRF)
add_simple_unittest(test_RecurrentLayer) add_simple_unittest(test_RecurrentLayer)
...@@ -29,6 +28,26 @@ gserver_test(test_KmaxSeqScore) ...@@ -29,6 +28,26 @@ gserver_test(test_KmaxSeqScore)
gserver_test(test_Expand) gserver_test(test_Expand)
gserver_test(test_MaxPoolingWithMaskOutput) gserver_test(test_MaxPoolingWithMaskOutput)
# Command prefix placed in front of test executables that need Python.
# It expands to three list items: the .set_python_path.sh script, its -d flag,
# and a colon-separated pair of directories (the python/ tree and the gserver
# tests directory).
# NOTE(review): presumably the script adds those directories to the Python
# search path before running the rest of the command line -- confirm against
# paddle/.set_python_path.sh.
set(PYTHON_PATH
${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
${PADDLE_SOURCE_DIR}/python/:${PADDLE_SOURCE_DIR}/paddle/gserver/tests)
# gserver_test_with_python(<target>)
#
# Builds <target> from <target>.cpp via add_unittest_without_exec() and
# registers a test of the same name. The test command is the built binary in
# the current binary directory, prefixed with ${PYTHON_PATH}, and it runs from
# ${PADDLE_SOURCE_DIR}/paddle/.
#
# Example: gserver_test_with_python(test_PyDataProvider2)
function(gserver_test_with_python TARGET)
  set(test_binary ${CMAKE_CURRENT_BINARY_DIR}/${TARGET})
  add_unittest_without_exec(${TARGET} ${TARGET}.cpp)
  add_test(
    NAME ${TARGET}
    COMMAND ${PYTHON_PATH} ${test_binary}
    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
endfunction()
# Tests registered through gserver_test_with_python():
#   - test_PyDataProvider2 is always registered.
#   - test_PyDataProvider is registered only when WITH_PYTHON is set.
#   - test_CompareTwoNets and test_RecurrentGradientMachine are skipped when
#     MOBILE_INFERENCE is set.
gserver_test_with_python(test_PyDataProvider2)
if(WITH_PYTHON)
gserver_test_with_python(test_PyDataProvider)
endif()
if(NOT MOBILE_INFERENCE)
gserver_test_with_python(test_CompareTwoNets)
# TODO(yuyang18): There is some bug in test_RecurrentGradientMachine, I will fix it.
gserver_test_with_python(test_RecurrentGradientMachine)
endif()
########## test_MKLDNN layers and activations ########## ########## test_MKLDNN layers and activations ##########
if(WITH_MKLDNN) if(WITH_MKLDNN)
add_unittest_without_exec(test_MKLDNN add_unittest_without_exec(test_MKLDNN
...@@ -36,18 +55,7 @@ if(WITH_MKLDNN) ...@@ -36,18 +55,7 @@ if(WITH_MKLDNN)
MKLDNNTester.cpp MKLDNNTester.cpp
LayerGradUtil.cpp) LayerGradUtil.cpp)
add_test(NAME test_MKLDNN add_test(NAME test_MKLDNN
COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/test_MKLDNN
${CMAKE_CURRENT_BINARY_DIR}/test_MKLDNN
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
endif()
############## test_PyDataProvider ########################
if(WITH_PYTHON)
add_unittest_without_exec(test_PyDataProvider
test_PyDataProvider.cpp)
add_test(NAME test_PyDataProvider
COMMAND .set_python_path.sh -d ./gserver/tests:${PADDLE_SOURCE_DIR}/python/ ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle) WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
endif() endif()
...@@ -55,68 +63,35 @@ endif() ...@@ -55,68 +63,35 @@ endif()
if(NOT WITH_DOUBLE AND NOT MOBILE_INFERENCE) if(NOT WITH_DOUBLE AND NOT MOBILE_INFERENCE)
add_unittest_without_exec(test_WarpCTCLayer add_unittest_without_exec(test_WarpCTCLayer
test_WarpCTCLayer.cpp) test_WarpCTCLayer.cpp)
add_test(NAME test_WarpCTCLayer add_test(NAME test_WarpCTCLayer
COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_WarpCTCLayer --warpctc_dir=${WARPCTC_LIB_DIR} COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_WarpCTCLayer --warpctc_dir=${WARPCTC_LIB_DIR}
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle) WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
endif() endif()
if(NOT MOBILE_INFERENCE) if(NOT MOBILE_INFERENCE)
################## test_Evaluator ####################### ################## test_Evaluator #############
add_unittest(test_Evaluator add_unittest(test_Evaluator
test_Evaluator.cpp) test_Evaluator.cpp)
############### test_RecurrentGradientMachine ############### ########### test_NetworkCompare ###############
# TODO(yuyang18): There is some bug in test_RecurrentGradientMachine
# I will fix it.
add_unittest_without_exec(test_RecurrentGradientMachine
test_RecurrentGradientMachine.cpp)
add_test(NAME test_RecurrentGradientMachine
COMMAND .set_python_path.sh -d
${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests
${CMAKE_CURRENT_BINARY_DIR}/test_RecurrentGradientMachine
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
############### test_NetworkCompare ###############
add_unittest_without_exec(test_NetworkCompare add_unittest_without_exec(test_NetworkCompare
test_NetworkCompare.cpp) test_NetworkCompare.cpp)
if(WITH_GPU) if(WITH_GPU)
add_test(NAME test_NetworkCompare set(use_gpu true)
COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=true
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
else() else()
set(use_gpu false)
endif()
add_test(NAME test_NetworkCompare add_test(NAME test_NetworkCompare
COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=false COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=${use_gpu}
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle) WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
endif()
endif()
add_unittest_without_exec(test_PyDataProvider2
test_PyDataProvider2.cpp)
add_test(NAME test_PyDataProvider2
COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/paddle/gserver/tests:${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider2
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle
)
################# test_CompareSparse ################## ############ test_CompareSparse ################
add_unittest_without_exec(test_CompareSparse add_unittest_without_exec(test_CompareSparse
test_CompareSparse.cpp) test_CompareSparse.cpp)
if(NOT ON_TRAVIS) if(NOT ON_TRAVIS)
add_test(NAME test_CompareSparse add_test(NAME test_CompareSparse
COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d COMMAND ${PYTHON_PATH} ./.set_port.sh -p port -n 6
${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests
./.set_port.sh -p port -n 6
${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse ${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
endif()
endif() endif()
################ test_CompareTwoNets ######################
add_unittest_without_exec(test_CompareTwoNets
test_CompareTwoNets.cpp)
add_test(NAME test_CompareTwoNets
COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests
${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoNets
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
...@@ -41,7 +41,7 @@ nonseq = embedding_layer(input=label, size=word_dim) ...@@ -41,7 +41,7 @@ nonseq = embedding_layer(input=label, size=word_dim)
# This hierarchical RNN is designed to be equivalent to the simple RNN in # This hierarchical RNN is designed to be equivalent to the simple RNN in
# sequence_rnn_multi_unequalength_inputs.conf # sequence_rnn_mixed_inputs.conf
def outer_step(subseq, seq, nonseq, encoding): def outer_step(subseq, seq, nonseq, encoding):
outer_mem = memory(name="outer_rnn_state", size=hidden_dim) outer_mem = memory(name="outer_rnn_state", size=hidden_dim)
......
...@@ -37,7 +37,7 @@ encoding = embedding_layer(input=data2, size=word_dim) ...@@ -37,7 +37,7 @@ encoding = embedding_layer(input=data2, size=word_dim)
# This hierarchical RNN is designed to be equivalent to the simple RNN in # This hierarchical RNN is designed to be equivalent to the simple RNN in
# sequence_rnn_multi_unequalength_inputs.conf # sequence_rnn_matched_inputs.conf
def outer_step(subseq, seq, nonseq, encoding): def outer_step(subseq, seq, nonseq, encoding):
outer_mem = memory(name="outer_rnn_state", size=hidden_dim) outer_mem = memory(name="outer_rnn_state", size=hidden_dim)
......
...@@ -681,12 +681,13 @@ TEST(Layer, hsigmoidLayer) { ...@@ -681,12 +681,13 @@ TEST(Layer, hsigmoidLayer) {
config.layerConfig.add_inputs(); config.layerConfig.add_inputs();
config.layerConfig.add_inputs(); config.layerConfig.add_inputs();
// Not support GPU now for (auto useGpu : {false, true}) {
testLayerGrad(config, testLayerGrad(config,
"hsigmoid", "hsigmoid",
100, 100,
/* trans */ false, /* useGpu */ /* trans */ false,
false); /* useGpu */ useGpu);
}
} }
TEST(Layer, multi_cross) { TEST(Layer, multi_cross) {
......
...@@ -26,8 +26,6 @@ else() ...@@ -26,8 +26,6 @@ else()
endif() endif()
if(MOBILE_INFERENCE) if(MOBILE_INFERENCE)
list(REMOVE_ITEM MATH_SOURCES
${CMAKE_CURRENT_SOURCE_DIR}/SIMDFunctions.cpp)
# Remove sparse # Remove sparse
list(REMOVE_ITEM MATH_HEADERS list(REMOVE_ITEM MATH_HEADERS
${CMAKE_CURRENT_SOURCE_DIR}/CpuSparseMatrix.h ${CMAKE_CURRENT_SOURCE_DIR}/CpuSparseMatrix.h
......
...@@ -116,9 +116,11 @@ inline bool vec_check(size_t len) { ...@@ -116,9 +116,11 @@ inline bool vec_check(size_t len) {
} }
namespace internal { namespace internal {
#ifdef __SSE3__
void addToImpl(float* a, const float* b, size_t len); void addToImpl(float* a, const float* b, size_t len);
void batchAddToImpl(float* a, const float* b[], int batch, size_t len); void batchAddToImpl(float* a, const float* b[], int batch, size_t len);
void colMaxImpl(float* result, const float* data, int dim, int numSamples); void colMaxImpl(float* result, const float* data, int dim, int numSamples);
#endif
#ifdef __AVX__ #ifdef __AVX__
void decayL1AvxImpl(float* dst, float* src, float lambda, size_t len); void decayL1AvxImpl(float* dst, float* src, float lambda, size_t len);
void decayL1AvxImpl( void decayL1AvxImpl(
......
...@@ -81,18 +81,33 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { ...@@ -81,18 +81,33 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
} }
template <> template <>
void* Alloc<platform::GPUPlace>(platform::GPUPlace place, size_t size) { size_t Used<platform::GPUPlace>(platform::GPUPlace place) {
return GetGPUBuddyAllocator(place.device)->Alloc(size); return GetGPUBuddyAllocator(place.device)->Used();
} }
template <> template <>
void Free<platform::GPUPlace>(platform::GPUPlace place, void* p) { void* Alloc<platform::GPUPlace>(platform::GPUPlace place, size_t size) {
GetGPUBuddyAllocator(place.device)->Free(p); auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
auto* ptr = buddy_allocator->Alloc(size);
if (ptr == nullptr) {
int cur_dev = platform::GetCurrentDeviceId();
platform::SetDeviceId(place.device);
size_t avail, total;
platform::GpuMemoryUsage(avail, total);
LOG(WARNING) << "Cannot allocate " << size << " bytes in GPU "
<< place.device << ", available " << avail << " bytes";
LOG(WARNING) << "total " << total;
LOG(WARNING) << "GpuMinChunkSize " << platform::GpuMinChunkSize();
LOG(WARNING) << "GpuMaxChunkSize " << platform::GpuMaxChunkSize();
LOG(WARNING) << "GPU memory used: " << Used<platform::GPUPlace>(place);
platform::SetDeviceId(cur_dev);
}
return ptr;
} }
template <> template <>
size_t Used<platform::GPUPlace>(platform::GPUPlace place) { void Free<platform::GPUPlace>(platform::GPUPlace place, void* p) {
return GetGPUBuddyAllocator(place.device)->Used(); GetGPUBuddyAllocator(place.device)->Free(p);
} }
#endif #endif
......
...@@ -191,6 +191,7 @@ set(DEPS_OPS ...@@ -191,6 +191,7 @@ set(DEPS_OPS
sum_op sum_op
pool_op pool_op
maxout_op maxout_op
unpool_op
pool_with_index_op pool_with_index_op
conv_op conv_op
conv_transpose_op conv_transpose_op
...@@ -211,18 +212,22 @@ set(DEPS_OPS ...@@ -211,18 +212,22 @@ set(DEPS_OPS
send_op send_op
recv_op) recv_op)
if(WITH_DISTRIBUTE)
add_subdirectory(detail) add_subdirectory(detail)
op_library(send_op SRCS send_op.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf) op_library(send_op SRCS send_op.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf)
set_source_files_properties( set_source_files_properties(
send_op.cc send_op.cc
PROPERTIES PROPERTIES
COMPILE_FLAGS "-Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
op_library(recv_op SRCS recv_op.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf) op_library(recv_op SRCS recv_op.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf)
set_source_files_properties( set_source_files_properties(
recv_op.cc recv_op.cc
PROPERTIES PROPERTIES
COMPILE_FLAGS "-Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op recv_op sum_op executor)
endif()
op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op) op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op)
op_library(cross_entropy_op DEPS cross_entropy) op_library(cross_entropy_op DEPS cross_entropy)
...@@ -235,6 +240,7 @@ op_library(adagrad_op DEPS selected_rows_functor) ...@@ -235,6 +240,7 @@ op_library(adagrad_op DEPS selected_rows_functor)
op_library(conv_op DEPS vol2col) op_library(conv_op DEPS vol2col)
op_library(pool_op DEPS pooling) op_library(pool_op DEPS pooling)
op_library(maxout_op DEPS maxouting) op_library(maxout_op DEPS maxouting)
op_library(unpool_op DEPS unpooling)
op_library(pool_with_index_op DEPS pooling) op_library(pool_with_index_op DEPS pooling)
op_library(lod_rank_table_op SRCS lod_rank_table_op.cc DEPS lod_rank_table) op_library(lod_rank_table_op SRCS lod_rank_table_op.cc DEPS lod_rank_table)
op_library(lod_tensor_to_array_op SRCS lod_tensor_to_array_op.cc DEPS lod_rank_table_op) op_library(lod_tensor_to_array_op SRCS lod_tensor_to_array_op.cc DEPS lod_rank_table_op)
...@@ -273,4 +279,3 @@ if(WITH_GPU) ...@@ -273,4 +279,3 @@ if(WITH_GPU)
cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
endif() endif()
cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op recv_op sum_op executor)
...@@ -62,13 +62,14 @@ class BatchNormOp : public framework::OperatorWithKernel { ...@@ -62,13 +62,14 @@ class BatchNormOp : public framework::OperatorWithKernel {
const auto x_dims = ctx->GetInputDim("X"); const auto x_dims = ctx->GetInputDim("X");
const TensorFormat tensor_format = const TensorFormat tensor_format =
StringToTensorFormat(ctx->Attrs().Get<std::string>("tensor_format")); StringToTensorFormat(ctx->Attrs().Get<std::string>("tensor_format"));
PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
"Input X must have 2 to 5 dimensions.");
const int C = const int C =
(tensor_format == TensorFormat::NCHW ? x_dims[1] (tensor_format == TensorFormat::NCHW ? x_dims[1]
: x_dims[x_dims.size() - 1]); : x_dims[x_dims.size() - 1]);
PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5,
"Input X must have 3 to 5 dimensions.");
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL); PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], C); PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], C);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL); PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL);
...@@ -146,8 +147,8 @@ class BatchNormKernel<platform::CPUPlace, T> : public framework::OpKernel<T> { ...@@ -146,8 +147,8 @@ class BatchNormKernel<platform::CPUPlace, T> : public framework::OpKernel<T> {
const auto *x = ctx.Input<Tensor>("X"); const auto *x = ctx.Input<Tensor>("X");
const auto &x_dims = x->dims(); const auto &x_dims = x->dims();
PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5, PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
"The Input dim size should be between 3 and 5"); "The Input dim size should be between 2 and 5");
const int N = x_dims[0]; const int N = x_dims[0];
const int C = const int C =
(tensor_format == TensorFormat::NCHW ? x_dims[1] (tensor_format == TensorFormat::NCHW ? x_dims[1]
...@@ -339,8 +340,8 @@ class BatchNormGradKernel<platform::CPUPlace, T> ...@@ -339,8 +340,8 @@ class BatchNormGradKernel<platform::CPUPlace, T>
// Get the size for each dimension. // Get the size for each dimension.
// NCHW [batch_size, in_channels, in_height, in_width] // NCHW [batch_size, in_channels, in_height, in_width]
const auto &x_dims = x->dims(); const auto &x_dims = x->dims();
PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5, PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
"The Input dim size should be between 3 and 5"); "The Input dim size should be between 2 and 5");
const int N = x_dims[0]; const int N = x_dims[0];
const int C = const int C =
(tensor_format == TensorFormat::NCHW ? x_dims[1] (tensor_format == TensorFormat::NCHW ? x_dims[1]
......
...@@ -29,6 +29,12 @@ void ExtractNCWHD(const framework::DDim &dims, ...@@ -29,6 +29,12 @@ void ExtractNCWHD(const framework::DDim &dims,
const TensorFormat &tensor_format, int *N, int *C, int *H, const TensorFormat &tensor_format, int *N, int *C, int *H,
int *W, int *D) { int *W, int *D) {
*N = dims[0]; *N = dims[0];
if (dims.size() == 2) {
*C = dims[1];
*H = 1;
*W = 1;
*D = 1;
} else {
*C = tensor_format == TensorFormat::NCHW ? dims[1] : dims[dims.size() - 1]; *C = tensor_format == TensorFormat::NCHW ? dims[1] : dims[dims.size() - 1];
*H = tensor_format == TensorFormat::NCHW ? dims[2] : dims[1]; *H = tensor_format == TensorFormat::NCHW ? dims[2] : dims[1];
*W = dims.size() > 3 *W = dims.size() > 3
...@@ -37,6 +43,7 @@ void ExtractNCWHD(const framework::DDim &dims, ...@@ -37,6 +43,7 @@ void ExtractNCWHD(const framework::DDim &dims,
*D = dims.size() > 4 *D = dims.size() > 4
? (tensor_format == TensorFormat::NCHW ? dims[4] : dims[3]) ? (tensor_format == TensorFormat::NCHW ? dims[4] : dims[3])
: 1; : 1;
}
} }
template <typename T> template <typename T>
...@@ -56,8 +63,8 @@ class BatchNormKernel<platform::GPUPlace, T> : public framework::OpKernel<T> { ...@@ -56,8 +63,8 @@ class BatchNormKernel<platform::GPUPlace, T> : public framework::OpKernel<T> {
// NCHW [batch_size, in_channels, in_height, in_width] // NCHW [batch_size, in_channels, in_height, in_width]
const auto *x = ctx.Input<Tensor>("X"); const auto *x = ctx.Input<Tensor>("X");
const auto &x_dims = x->dims(); const auto &x_dims = x->dims();
PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5, PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
"The Input dim size should be between 3 and 5"); "The Input dim size should be between 2 and 5");
int N, C, H, W, D; int N, C, H, W, D;
ExtractNCWHD(x_dims, tensor_format, &N, &C, &H, &W, &D); ExtractNCWHD(x_dims, tensor_format, &N, &C, &H, &W, &D);
...@@ -180,8 +187,8 @@ class BatchNormGradKernel<platform::GPUPlace, T> ...@@ -180,8 +187,8 @@ class BatchNormGradKernel<platform::GPUPlace, T>
const auto &x_dims = x->dims(); const auto &x_dims = x->dims();
PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5, PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
"The Input dim size should be between 3 and 5"); "The Input dim size should be between 2 and 5");
int N, C, H, W, D; int N, C, H, W, D;
ExtractNCWHD(x_dims, tensor_format, &N, &C, &H, &W, &D); ExtractNCWHD(x_dims, tensor_format, &N, &C, &H, &W, &D);
......
...@@ -25,7 +25,7 @@ class ConcatOp : public framework::OperatorWithKernel { ...@@ -25,7 +25,7 @@ class ConcatOp : public framework::OperatorWithKernel {
void InferShape(framework::InferShapeContext *ctx) const override { void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE_GE(ctx->Inputs("X").size(), 1UL, PADDLE_ENFORCE_GE(ctx->Inputs("X").size(), 1UL,
"Inputs(X) of ConcatOp should be empty.") "Inputs(X) of ConcatOp should be empty.");
PADDLE_ENFORCE(ctx->HasOutput("Out"), PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of ConcatOp should not be null."); "Output(Out) of ConcatOp should not be null.");
...@@ -45,7 +45,7 @@ class ConcatOp : public framework::OperatorWithKernel { ...@@ -45,7 +45,7 @@ class ConcatOp : public framework::OperatorWithKernel {
} }
PADDLE_ENFORCE_EQ(out_dims[j], ins[i][j], PADDLE_ENFORCE_EQ(out_dims[j], ins[i][j],
"Input tensors should have the same " "Input tensors should have the same "
"elements except the specify axis.") "elements except the specify axis.");
} }
} }
ctx->SetOutputDim("Out", out_dims); ctx->SetOutputDim("Out", out_dims);
......
...@@ -63,7 +63,7 @@ class CudnnConvOpKernel : public framework::OpKernel<T> { ...@@ -63,7 +63,7 @@ class CudnnConvOpKernel : public framework::OpKernel<T> {
cudnnConvolutionDescriptor_t cudnn_conv_desc = cudnnConvolutionDescriptor_t cudnn_conv_desc =
conv_desc.descriptor<T>(paddings, strides, dilations); conv_desc.descriptor<T>(paddings, strides, dilations);
#if CUDNN_VERSION_MIN(7, 0, 0) #if CUDNN_VERSION_MIN(7, 0, 1)
// cudnn 7 can support groups, no need to do it mannually // cudnn 7 can support groups, no need to do it mannually
// FIXME(typhoonzero): find a better way to disable groups // FIXME(typhoonzero): find a better way to disable groups
// rather than setting it to 1. // rather than setting it to 1.
...@@ -180,7 +180,7 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> { ...@@ -180,7 +180,7 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
cudnnConvolutionDescriptor_t cudnn_conv_desc = cudnnConvolutionDescriptor_t cudnn_conv_desc =
conv_desc.descriptor<T>(paddings, strides, dilations); conv_desc.descriptor<T>(paddings, strides, dilations);
#if CUDNN_VERSION_MIN(7, 0, 0) #if CUDNN_VERSION_MIN(7, 0, 1)
// cudnn 7 can support groups, no need to do it mannually // cudnn 7 can support groups, no need to do it mannually
// FIXME(typhoonzero): find a better way to disable groups // FIXME(typhoonzero): find a better way to disable groups
// rather than setting it to 1. // rather than setting it to 1.
......
...@@ -97,7 +97,7 @@ Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto, ...@@ -97,7 +97,7 @@ Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto,
.SetDefault({0, 0}); .SetDefault({0, 0});
AddAttr<int>( AddAttr<int>(
"groups", "groups",
"(int default:1), the group size of convolution operator. " "(int default:1), the groups number of the convolution operator. "
"According to grouped convolution in Alex Krizhevsky's Deep CNN paper: " "According to grouped convolution in Alex Krizhevsky's Deep CNN paper: "
"when group=2, the first half of the filters is only connected to the " "when group=2, the first half of the filters is only connected to the "
"first half of the input channels, while the second half of the filters " "first half of the input channels, while the second half of the filters "
...@@ -112,23 +112,29 @@ Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto, ...@@ -112,23 +112,29 @@ Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto,
Convolution Operator. Convolution Operator.
The convolution operation calculates the output based on the input, filter The convolution operation calculates the output based on the input, filter
and strides, paddings, groups, dilations parameters. The size of each dimension of the and strides, paddings, dilations, groups parameters. The size of each dimension of the
parameters is checked in the infer-shape. parameters is checked in the infer-shape.
Input(Input, Filter) and output(Output) are in NCHW format. Where N is batch Input(Input) and Output(Output) are in NCHW format. Where N is batch
size, C is the number of channels, H is the height of the feature, and W is size, C is the number of channels, H is the height of the feature, and W is
the width of the feature. Parameters(ksize, strides, paddings, dilations) are two elements. the width of the feature.
These two elements represent height and width, respectively. Filters(Input) is MCHW format. Where M is the number of output image channels, C is
the number of input image channels, H is the height of the filter, and W
is the width of the filter.
Parameters(strides, paddings, dilations) are two elements. These two elements represent
height and width, respectively.
The input(X) size and output(Out) size may be different. The input(X) size and output(Out) size may be different.
Example: Example:
Input: Input:
Input shape: (N, C_in, H_in, W_in) Input shape: $(N, C_{in}, H_{in}, W_{in})$
Filter shape: (C_out, C_in, H_f, W_f) Filter shape: $(C_{out}, C_{in}, H_f, W_f)$
Output: Output:
Output shape: (N, C_out, H_out, W_out) Output shape: $(N, C_{out}, H_{out}, W_{out})$
where Where
H_out = (H_in + 2 * paddings[0] - (dilations[0]*(filter_size[0] - 1) + 1)) / strides[0] + 1; $$
W_out = (W_in + 2 * paddings[1] - (dilations[1]*(filter_size[1] - 1) + 1)) / strides[1] + 1; H_{out}= \frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]}+ 1 \\
W_{out}= \frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]}+ 1
$$
)DOC"); )DOC");
} }
...@@ -165,7 +171,7 @@ Conv3DOpMaker::Conv3DOpMaker(framework::OpProto* proto, ...@@ -165,7 +171,7 @@ Conv3DOpMaker::Conv3DOpMaker(framework::OpProto* proto,
.SetDefault({0, 0, 0}); .SetDefault({0, 0, 0});
AddAttr<int>( AddAttr<int>(
"groups", "groups",
"(int default:1), the group size of convolution operator. " "(int default:1), the groups number of the convolution operator. "
"According to grouped convolution in Alex Krizhevsky's Deep CNN paper: " "According to grouped convolution in Alex Krizhevsky's Deep CNN paper: "
"when group=2, the first half of the filters is only connected to the " "when group=2, the first half of the filters is only connected to the "
"first half of the input channels, while the second half of the filters " "first half of the input channels, while the second half of the filters "
...@@ -174,32 +180,37 @@ Conv3DOpMaker::Conv3DOpMaker(framework::OpProto* proto, ...@@ -174,32 +180,37 @@ Conv3DOpMaker::Conv3DOpMaker(framework::OpProto* proto,
AddAttr<std::vector<int>>("dilations", AddAttr<std::vector<int>>("dilations",
"(vector<int> default:{1, 1, 1}), the " "(vector<int> default:{1, 1, 1}), the "
"dilations(d_dilation, h_dilation, w_dilation) of " "dilations(d_dilation, h_dilation, w_dilation) of "
"convolution operator. Currently, conv3d doesn't " "convolution operator.")
"support dilation.")
.SetDefault({1, 1, 1}); .SetDefault({1, 1, 1});
AddComment(R"DOC( AddComment(R"DOC(
Convolution3D Operator. Convolution3D Operator.
The convolution operation calculates the output based on the input, filter The convolution operation calculates the output based on the input, filter
and strides, paddings, groups parameters. The size of each dimension of the and strides, paddings, dilations, groups parameters. The size of each dimension of the
parameters is checked in the infer-shape. parameters is checked in the infer-shape.
Input(Input, Filter) and output(Output) are in NCDHW format. Where N is batch Input(Input) and output(Output) are in NCDHW format, where N is batch
size, C is the number of channels,D is the depth of the feature, H is the height of size, C is the number of channels,D is the depth of the feature, H is the height of
the feature, and W is the width of the feature. Parameters(ksize, strides, paddings) the feature, and W is the width of the feature.
are three elements. These three elements represent depth, height and width, respectively. Filters(Input) is MCDHW format, where M is the number of output image channels,
C is the number of input image channels, D is the depth of the filter,
H is the height of the filter, and W is the width of the filter.
Parameters(strides, paddings, dilations) are three elements. These three elements
represent depth, height and width, respectively.
The input(X) size and output(Out) size may be different. The input(X) size and output(Out) size may be different.
Example: Example:
Input: Input:
Input shape: (N, C_in, D_in, H_in, W_in) Input shape: $(N, C_{in}, D_{in}, H_{in}, W_{in})$
Filter shape: (C_out, C_in, D_f, H_f, W_f) Filter shape: $(C_{out}, C_{in}, D_f, H_f, W_f)$
Output: Output:
Output shape: (N, C_out, D_out, H_out, W_out) Output shape: $(N, C_{out}, D_{out}, H_{out}, W_{out})$
where Where
D_out = (D_in - filter_size[0] + 2 * paddings[0]) / strides[0] + 1; $$
H_out = (H_in - filter_size[1] + 2 * paddings[1]) / strides[1] + 1; D_{out}= \frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (D_f - 1) + 1))}{ strides[0]}+ 1 \\
W_out = (W_in - filter_size[2] + 2 * paddings[2]) / strides[2] + 1; H_{out}= \frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (H_f - 1) + 1))}{ strides[1]}+ 1 \\
W_{out}= \frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{ strides[2]}+ 1
$$
)DOC"); )DOC");
} }
......
...@@ -39,7 +39,7 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -39,7 +39,7 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
"ConvTransposeOp input dimension and strides dimension should " "ConvTransposeOp input dimension and strides dimension should "
"be consistent."); "be consistent.");
PADDLE_ENFORCE_EQ(paddings.size(), strides.size(), PADDLE_ENFORCE_EQ(paddings.size(), strides.size(),
"ConvTransposeOp paddings dimension and Conv strides " "ConvTransposeOp paddings dimension and strides "
"dimension should be the same."); "dimension should be the same.");
PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[0], PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[0],
"In ConvTransposeOp, The input channel should be the same " "In ConvTransposeOp, The input channel should be the same "
...@@ -62,24 +62,25 @@ Conv2DTransposeOpMaker::Conv2DTransposeOpMaker( ...@@ -62,24 +62,25 @@ Conv2DTransposeOpMaker::Conv2DTransposeOpMaker(
"The format of input tensor is NCHW. Where N is batch size, C is the " "The format of input tensor is NCHW. Where N is batch size, C is the "
"number of input channels, H is the height of the feature, and " "number of input channels, H is the height of the feature, and "
"W is the width of the feature."); "W is the width of the feature.");
AddInput("Filter", AddInput(
"Filter",
"(Tensor) The filter tensor of convolution transpose operator. " "(Tensor) The filter tensor of convolution transpose operator. "
"The format of the filter tensor is CMHW, where C is the number of " "The format of the filter tensor is MCHW, where M is the number of "
"output image channels, M is the number of input image channels, " "input feature channels, C is the number of "
"output feature channels,"
"H is the height of the filter, and W is the width of the filter. " "H is the height of the filter, and W is the width of the filter. "
"We enforce groups number == 1 and padding == 0 in " "We enforce groups number == 1 in the convolution transpose scenario.");
"the convolution transpose scenario.");
AddOutput("Output", AddOutput("Output",
"(Tensor) The output tensor of convolution transpose operator. " "(Tensor) The output tensor of convolution transpose operator. "
"The format of output tensor is also NCHW."); "The format of output tensor is also NCHW.");
AddAttr<std::vector<int>>( AddAttr<std::vector<int>>(
"strides", "strides",
"(vector<int> defalut:{1, 1}), the strides(h_stride, w_stride) of " "(vector<int> default:{1, 1}), the strides(h_stride, w_stride) of "
"convolution transpose operator.") "convolution transpose operator.")
.SetDefault({1, 1}); .SetDefault({1, 1});
AddAttr<std::vector<int>>( AddAttr<std::vector<int>>(
"paddings", "paddings",
"(vector<int> defalut:{0, 0}), the paddings(h_pad, w_pad) of convolution " "(vector<int> default:{0, 0}), the paddings(h_pad, w_pad) of convolution "
"transpose operator.") "transpose operator.")
.SetDefault({0, 0}); .SetDefault({0, 0});
AddComment(R"DOC( AddComment(R"DOC(
...@@ -88,21 +89,26 @@ Convolution2D Transpose Operator. ...@@ -88,21 +89,26 @@ Convolution2D Transpose Operator.
The convolution transpose operation calculates the output based on the input, filter The convolution transpose operation calculates the output based on the input, filter
and strides, paddings, groups parameters. The size of each dimension of the and strides, paddings, groups parameters. The size of each dimension of the
parameters is checked in the infer-shape. parameters is checked in the infer-shape.
Input(Input) and output(Output) are in NCHW format. Where N is batchsize, C is the
Input(Input, Filter) and output(Output) are in NCHW format. Where N is batch number of channels, H is the height of the feature, and W is the width of the feature.
size, C is the number of channels, H is the height of the feature, and Filter(Input) is in MCHW format. Where M is the number of input feature channels,
W is the width of the feature. Parameters(ksize, strides, paddings) are two elements. C is the number of output feature channels, H is the height of the filter,
These two elements represent height and width, respectively. and W is the width of the filter.
Parameters(strides, paddings) are two elements. These two elements represent height
and width, respectively.
The input(X) size and output(Out) size may be different. The input(X) size and output(Out) size may be different.
Example: Example:
Input: Input:
Input shape: (N, C_in, H_in, W_in) Input shape: $(N, C_{in}, H_{in}, W_{in})$
Filter shape: (C_in, C_out, H_f, W_f) Filter shape: $(C_{in}, C_{out}, H_f, W_f)$
Output: Output:
Output shape: (N, C_out, H_out, W_out) Output shape: $(N, C_{out}, H_{out}, W_{out})$
where Where
H_out = (H_in - 1) * strides[0] - 2 * paddings[0] + filter_size[0]; $$
W_out = (W_in - 1) * strides[1] - 2 * paddings[1] + filter_size[1]; H_{out} = (H_{in} - 1) * strides[0] - 2 * paddings[0] + H_f \\
W_{out} = (W_{in} - 1) * strides[1] - 2 * paddings[1] + W_f
$$
)DOC"); )DOC");
} }
...@@ -117,8 +123,9 @@ Conv3DTransposeOpMaker::Conv3DTransposeOpMaker( ...@@ -117,8 +123,9 @@ Conv3DTransposeOpMaker::Conv3DTransposeOpMaker(
"W is the width of the feature."); "W is the width of the feature.");
AddInput("Filter", AddInput("Filter",
"(Tensor) The filter tensor of convolution transpose operator." "(Tensor) The filter tensor of convolution transpose operator."
"The format of the filter tensor is CMDHW, where C is the number of " "The format of the filter tensor is MCDHW, where M is the number of "
"output image channels, M is the number of input image channels, D " "input feature channels, C is the number of "
"output feature channels, D "
"is the depth of the filter, H is the height of the filter, and " "is the depth of the filter, H is the height of the filter, and "
"W is the width of the filter." "W is the width of the filter."
"We enforce groups number == 1 and padding == 0 in " "We enforce groups number == 1 and padding == 0 in "
...@@ -130,12 +137,12 @@ Conv3DTransposeOpMaker::Conv3DTransposeOpMaker( ...@@ -130,12 +137,12 @@ Conv3DTransposeOpMaker::Conv3DTransposeOpMaker(
"the number of channels, D is the depth of the feature, H is the " "the number of channels, D is the depth of the feature, H is the "
"height of the feature, and W is the width of the feature."); "height of the feature, and W is the width of the feature.");
AddAttr<std::vector<int>>("strides", AddAttr<std::vector<int>>("strides",
"(vector<int> defalut:{1, 1, 1}), the " "(vector<int> default:{1, 1, 1}), the "
"strides{d_stride, h_stride, w_stride} of " "strides{d_stride, h_stride, w_stride} of "
"convolution transpose operator.") "convolution transpose operator.")
.SetDefault({1, 1, 1}); .SetDefault({1, 1, 1});
AddAttr<std::vector<int>>("paddings", AddAttr<std::vector<int>>("paddings",
"(vector<int> defalut:{0, 0, 0}), paddings(d_pad, " "(vector<int> default:{0, 0, 0}), paddings(d_pad, "
"h_pad, w_pad) of convolution transpose operator.") "h_pad, w_pad) of convolution transpose operator.")
.SetDefault({0, 0, 0}); .SetDefault({0, 0, 0});
AddComment(R"DOC( AddComment(R"DOC(
...@@ -144,23 +151,28 @@ Convolution3D Transpose Operator. ...@@ -144,23 +151,28 @@ Convolution3D Transpose Operator.
The convolution transpose operation calculates the output based on the input, filter The convolution transpose operation calculates the output based on the input, filter
and strides, paddings, groups parameters. The size of each dimension of the and strides, paddings, groups parameters. The size of each dimension of the
parameters is checked in the infer-shape. parameters is checked in the infer-shape.
Input(Input) and output(Output) are in NCDHW format. Where N is batch size, C is the
Input(Input, Filter) and output(Output) are in NCDHW format. Where N is batch number of channels, D is the depth of the feature, H is the height of the feature,
size, C is the number of channels, D is the depth of the feature, and W is the width of the feature.
H is the height of the feature, and W is the width of the feature. Filter(Input) is in MCDHW format. Where M is the number of input feature channels,
Parameters(ksize, strides, paddings) are three elements. C is the number of output feature channels, D is the depth of the filter,H is the
These three elements represent depth, height and width, respectively. height of the filter, and W is the width of the filter.
Parameters(strides, paddings) are three elements. These three elements represent
depth, height and width, respectively.
The input(X) size and output(Out) size may be different. The input(X) size and output(Out) size may be different.
Example: Example:
Input: Input:
Input shape: (N, C_in, D_in, H_in, W_in) Input shape: $(N, C_{in}, D_{in}, H_{in}, W_{in})$
Filter shape: (C_in, C_out, D_f, H_f, W_f) Filter shape: $(C_{in}, C_{out}, D_f, H_f, W_f)$
Output: Output:
Output shape: (N, C_out, D_out, H_out, W_out) Output shape: $(N, C_{out}, D_{out}, H_{out}, W_{out})$
where Where
D_out = (D_in - 1) * strides[0] - 2 * paddings[0] + filter_size[0]; $$
H_out = (H_in - 1) * strides[1] - 2 * paddings[1] + filter_size[1]; D_{out} = (D_{in} - 1) * strides[0] - 2 * paddings[0] + D_f \\
W_out = (W_in - 1) * strides[2] - 2 * paddings[2] + filter_size[2]; H_{out} = (H_{in} - 1) * strides[1] - 2 * paddings[1] + H_f \\
W_{out} = (W_{in} - 1) * strides[2] - 2 * paddings[2] + W_f
$$
)DOC"); )DOC");
} }
......
...@@ -63,7 +63,6 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> { ...@@ -63,7 +63,6 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {
std::vector<int> strides = context.Attr<std::vector<int>>("strides"); std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings"); std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
// TODO(Zhuoyuan): Paddings can be added in future.
// groups will alway be disabled in conv2dtranspose. // groups will alway be disabled in conv2dtranspose.
const int batch_size = static_cast<int>(input->dims()[0]); const int batch_size = static_cast<int>(input->dims()[0]);
......
...@@ -32,6 +32,4 @@ message VariableMessage { ...@@ -32,6 +32,4 @@ message VariableMessage {
bytes serialized = 2; bytes serialized = 2;
} }
message VoidMessage { message VoidMessage {}
}
\ No newline at end of file
...@@ -19,11 +19,48 @@ ...@@ -19,11 +19,48 @@
namespace paddle { namespace paddle {
namespace operators { namespace operators {
// Binary functor that yields the sum of its two operands.
// The call operator is marked HOSTDEVICE and takes both operands by value.
template <typename T>
struct AddFunctor {
  HOSTDEVICE T operator()(T lhs, T rhs) const {
    T sum = lhs + rhs;
    return sum;
  }
};
template <typename Place, typename T> template <typename Place, typename T>
class ElementwiseAddKernel : public framework::OpKernel<T> { class ElementwiseAddKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
ElementwiseCompute<EigenAddFunctor, Place, T>(ctx); using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* z = ctx.Output<Tensor>("Out");
z->mutable_data<T>(ctx.GetPlace());
TransformFunctor<AddFunctor<T>, T, Place> functor(
x, y, z, ctx.device_context(), AddFunctor<T>());
auto x_dims = x->dims();
auto y_dims = y->dims();
PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
"Rank of first input must >= rank of second input.");
if (x_dims == y_dims) {
functor.Run();
return;
}
int axis = ctx.Attr<int>("axis");
axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
"Axis should be in range [0, x_dims)");
int pre, n, post;
get_mid_dims(x_dims, y_dims, axis, pre, n, post);
if (post == 1) {
functor.RunRowWise(n, pre);
return;
} else {
functor.RunMidWise(n, pre, post);
return;
}
} }
}; };
......
...@@ -35,7 +35,7 @@ class ElementwiseOp : public framework::OperatorWithKernel { ...@@ -35,7 +35,7 @@ class ElementwiseOp : public framework::OperatorWithKernel {
auto x_dim = ctx->GetInputDim("X"); auto x_dim = ctx->GetInputDim("X");
auto y_dim = ctx->GetInputDim("Y"); auto y_dim = ctx->GetInputDim("Y");
PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(), PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(),
"Rank of first input must >= rank of second input.") "Rank of first input must >= rank of second input.");
ctx->SetOutputDim("Out", x_dim); ctx->SetOutputDim("Out", x_dim);
ctx->ShareLoD("X", /*->*/ "Out"); ctx->ShareLoD("X", /*->*/ "Out");
} }
...@@ -120,7 +120,7 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { ...@@ -120,7 +120,7 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {
auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(), PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
"Rank of first input must >= rank of second input.") "Rank of first input must >= rank of second input.");
auto x_grad_name = framework::GradVarName("X"); auto x_grad_name = framework::GradVarName("X");
auto y_grad_name = framework::GradVarName("Y"); auto y_grad_name = framework::GradVarName("Y");
......
...@@ -16,6 +16,11 @@ ...@@ -16,6 +16,11 @@
#include "paddle/framework/eigen.h" #include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h" #include "paddle/framework/op_registry.h"
#include "paddle/framework/operator.h" #include "paddle/framework/operator.h"
#include "paddle/platform/transform.h"
#ifdef __NVCC__
#include <thrust/iterator/iterator_adaptor.h>
#endif
#include "paddle/operators/math/math_function.h" #include "paddle/operators/math/math_function.h"
...@@ -54,6 +59,153 @@ inline void get_mid_dims(const framework::DDim& x_dims, ...@@ -54,6 +59,153 @@ inline void get_mid_dims(const framework::DDim& x_dims,
} }
} }
// Primary templates for the two broadcasting iterators. They are only
// declared here; the usable definitions are the specializations on `Place`
// that follow in this header.
template <typename T, typename Place>
class RowwiseTransformIterator;
template <typename T, typename Place>
class MidWiseTransformIterator;
// CPU specialization: a read-only, cyclic iterator over a buffer of `n`
// elements. Every increment moves to the next element and wraps back to the
// first one after the n-th, so the sequence ptr[0], ..., ptr[n-1] repeats for
// as long as the iterator keeps being advanced.
template <typename T>
class RowwiseTransformIterator<T, platform::CPUPlace> {
 public:
  // ptr: first element of the buffer.
  // n:   number of elements visited before wrapping around.
  RowwiseTransformIterator(const T* ptr, int n) : ptr_(ptr), i_(0), n_(n) {}

  // Advances to the next element, wrapping to index 0 after index n - 1.
  RowwiseTransformIterator<T, platform::CPUPlace>& operator++() {
    ++i_;
    i_ %= n_;
    return *this;
  }

  // Two iterators are equal when they currently refer to the same address.
  bool operator==(
      const RowwiseTransformIterator<T, platform::CPUPlace>& rhs) const {
    return (ptr_ + i_) == &(*rhs);
  }

  bool operator!=(
      const RowwiseTransformIterator<T, platform::CPUPlace>& rhs) const {
    return (ptr_ + i_) != &(*rhs);
  }

  // Must be const: the comparison operators above dereference a const `rhs`,
  // which does not compile against a non-const operator*.
  const T& operator*() const { return ptr_[i_]; }

 private:
  const T* ptr_;  // start of the buffer
  int i_;         // current index, always in [0, n_)
  int64_t n_;     // wrap-around period
};
// CPU specialization: a forward-only iterator over the `n` elements starting
// at `ptr` in which every element is yielded `post` times in a row before the
// iterator moves on; after the last element it wraps back to the first.
template <typename T>
class MidWiseTransformIterator<T, platform::CPUPlace> {
 public:
  // ptr:  first element of the sequence.
  // n:    number of distinct elements in one cycle.
  // post: number of consecutive steps that map to the same element.
  MidWiseTransformIterator(const T* ptr, int n, int post)
      : ptr_(ptr), i_(0), j_(0), n_(n), post_(post) {}

  // Increments the linear step counter and recomputes the element index as
  // (step / post) % n.
  MidWiseTransformIterator<T, platform::CPUPlace>& operator++() {
    i_ = (++j_ / post_) % n_;
    return *this;
  }

  // Two iterators compare equal when they currently refer to the same address.
  bool operator==(
      const MidWiseTransformIterator<T, platform::CPUPlace>& rhs) const {
    return (ptr_ + i_) == &(*rhs);
  }

  bool operator!=(
      const MidWiseTransformIterator<T, platform::CPUPlace>& rhs) const {
    return (ptr_ + i_) != &(*rhs);
  }

  // Const-qualified so that the comparison operators, which dereference a
  // const `rhs`, compile when they are instantiated.
  const T& operator*() const { return ptr_[i_]; }

 private:
  const T* ptr_;  // start of the sequence
  int i_;         // index of the element currently referred to
  int64_t j_;     // number of increments performed so far
  int64_t n_;     // number of distinct elements
  int post_;      // repetitions of each element
};
#ifdef __NVCC__
// GPU specialization, built on thrust::iterator_adaptor over a `const T*`.
// The adaptor supplies the iterator arithmetic; only dereference() is
// customized so that the linear position is folded back into the first `n`
// elements.
template <typename T>
class RowwiseTransformIterator<T, platform::GPUPlace>
    : public thrust::iterator_adaptor<
          RowwiseTransformIterator<T, platform::GPUPlace>, const T*> {
 public:
  typedef thrust::iterator_adaptor<
      RowwiseTransformIterator<T, platform::GPUPlace>, const T*>
      super_t;

  // Initializers are written in member declaration order (n_, then begin_),
  // which is the order in which they are actually executed.
  HOSTDEVICE RowwiseTransformIterator(const T* x, int n)
      : super_t(x), n_(n), begin_(x) {}

  friend class thrust::iterator_core_access;

 private:
  unsigned int n_;  // cycle length
  const T* begin_;  // start of the cycled sequence

  // Called through thrust::iterator_core_access: returns the element at
  // offset (current - begin_) % n_ from begin_.
  HOSTDEVICE typename super_t::reference dereference() const {
    return *(begin_ + (this->base() - begin_) % n_);
  }
};
// GPU specialization, built on thrust::iterator_adaptor over a `const T*`.
// The adaptor supplies the iterator arithmetic; dereference() maps the linear
// position onto one of the first `n` elements, holding each element for
// `post` consecutive positions.
template <typename T>
class MidWiseTransformIterator<T, platform::GPUPlace>
    : public thrust::iterator_adaptor<
          MidWiseTransformIterator<T, platform::GPUPlace>, const T*> {
 public:
  typedef thrust::iterator_adaptor<
      MidWiseTransformIterator<T, platform::GPUPlace>, const T*>
      super_t;

  // Initializers are written in member declaration order (post_, n_, begin_),
  // which is the order in which they are actually executed.
  HOSTDEVICE MidWiseTransformIterator(const T* x, int n, int post)
      : super_t(x), post_(post), n_(n), begin_(x) {}

  friend class thrust::iterator_core_access;

 private:
  unsigned int post_;  // consecutive positions mapped to the same element
  unsigned int n_;     // number of distinct elements
  const T* begin_;     // start of the sequence

  // Called through thrust::iterator_core_access: returns the element at
  // offset ((current - begin_) / post_) % n_ from begin_.
  HOSTDEVICE typename super_t::reference dereference() const {
    return *(begin_ + (((this->base() - begin_) / post_) % n_));
  }
};
#endif
template <typename Functor, typename T, typename Place>
class TransformFunctor {
public:
TransformFunctor(const framework::Tensor* x, const framework::Tensor* y,
framework::Tensor* z, const platform::DeviceContext& ctx,
Functor func)
: x_(x->data<T>()),
y_(y->data<T>()),
z_(z->mutable_data<T>(ctx.GetPlace())),
nx_(x->numel()),
ctx_(ctx),
func_(func) {}
inline void Run() const {
platform::Transform<Place> trans;
trans(ctx_, x_, x_ + nx_, y_, z_, func_);
}
inline void RunRowWise(int n, int pre) const {
platform::Transform<Place> trans;
trans(ctx_, x_, x_ + nx_, RowwiseTransformIterator<T, Place>(y_, n), z_,
func_);
}
inline void RunMidWise(int n, int pre, int post) const {
platform::Transform<Place> trans;
trans(ctx_, x_, x_ + nx_, MidWiseTransformIterator<T, Place>(y_, n, post),
z_, func_);
}
private:
const T* x_;
const T* y_;
T* z_;
int64_t nx_;
const platform::DeviceContext& ctx_;
Functor func_;
};
#define EIGEN_FUNCTOR(name, eigen_op) \ #define EIGEN_FUNCTOR(name, eigen_op) \
struct Eigen##name##Functor { \ struct Eigen##name##Functor { \
template <typename Place, typename T> \ template <typename Place, typename T> \
...@@ -106,7 +258,7 @@ void ElementwiseCompute(const framework::ExecutionContext& ctx) { ...@@ -106,7 +258,7 @@ void ElementwiseCompute(const framework::ExecutionContext& ctx) {
auto x_dims = x->dims(); auto x_dims = x->dims();
auto y_dims = y->dims(); auto y_dims = y->dims();
PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(), PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
"Rank of first input must >= rank of second input.") "Rank of first input must >= rank of second input.");
if (x_dims == y_dims) { if (x_dims == y_dims) {
functor f; functor f;
......
...@@ -71,8 +71,8 @@ class GRUKernel : public framework::OpKernel<T> { ...@@ -71,8 +71,8 @@ class GRUKernel : public framework::OpKernel<T> {
int frame_size = hidden_dims[1]; int frame_size = hidden_dims[1];
math::hl_gru_value<T> gru_value; math::hl_gru_value<T> gru_value;
gru_value.gateWeight = const_cast<T*>(weight_data); gru_value.gate_weight = const_cast<T*>(weight_data);
gru_value.stateWeight = gru_value.state_weight =
const_cast<T*>(weight_data + 2 * frame_size * frame_size); const_cast<T*>(weight_data + 2 * frame_size * frame_size);
Tensor ordered_h0; Tensor ordered_h0;
const size_t* order = batch_gate->lod()[2].data(); const size_t* order = batch_gate->lod()[2].data();
...@@ -82,9 +82,9 @@ class GRUKernel : public framework::OpKernel<T> { ...@@ -82,9 +82,9 @@ class GRUKernel : public framework::OpKernel<T> {
// to reorder. // to reorder.
ReorderInitState<Place, T>(context.device_context(), *h0, order, ReorderInitState<Place, T>(context.device_context(), *h0, order,
&ordered_h0, true); &ordered_h0, true);
gru_value.prevOutValue = ordered_h0.data<T>(); gru_value.prev_out_value = ordered_h0.data<T>();
} else { } else {
gru_value.prevOutValue = nullptr; gru_value.prev_out_value = nullptr;
} }
auto batch_starts = batch_gate->lod()[0]; auto batch_starts = batch_gate->lod()[0];
size_t num_batch = batch_starts.size() - 1; size_t num_batch = batch_starts.size() - 1;
...@@ -96,14 +96,14 @@ class GRUKernel : public framework::OpKernel<T> { ...@@ -96,14 +96,14 @@ class GRUKernel : public framework::OpKernel<T> {
Tensor gate_t = batch_gate->Slice(bstart, bend); Tensor gate_t = batch_gate->Slice(bstart, bend);
Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend); Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend);
Tensor hidden_t = batch_hidden->Slice(bstart, bend); Tensor hidden_t = batch_hidden->Slice(bstart, bend);
gru_value.outputValue = hidden_t.data<T>(); gru_value.output_value = hidden_t.data<T>();
gru_value.gateValue = gate_t.data<T>(); gru_value.gate_value = gate_t.data<T>();
gru_value.resetOutputValue = reset_hidden_prev_t.data<T>(); gru_value.reset_output_value = reset_hidden_prev_t.data<T>();
math::GRUUnitFunctor<Place, T>::compute( math::GRUUnitFunctor<Place, T>::compute(
dev_ctx, gru_value, frame_size, cur_batch_size, dev_ctx, gru_value, frame_size, cur_batch_size,
math::ActiveType(context.Attr<std::string>("activation")), math::ActiveType(context.Attr<std::string>("activation")),
math::ActiveType(context.Attr<std::string>("gate_activation"))); math::ActiveType(context.Attr<std::string>("gate_activation")));
gru_value.prevOutValue = gru_value.outputValue; gru_value.prev_out_value = gru_value.output_value;
} }
math::Batch2LoDTensorFunctor<Place, T> to_seq; math::Batch2LoDTensorFunctor<Place, T> to_seq;
...@@ -169,20 +169,20 @@ class GRUGradKernel : public framework::OpKernel<T> { ...@@ -169,20 +169,20 @@ class GRUGradKernel : public framework::OpKernel<T> {
to_batch(dev_ctx, *hidden_grad, batch_hidden_grad, false, is_reverse); to_batch(dev_ctx, *hidden_grad, batch_hidden_grad, false, is_reverse);
math::hl_gru_value<T> gru_value; math::hl_gru_value<T> gru_value;
gru_value.gateWeight = const_cast<T*>(weight_data); gru_value.gate_weight = const_cast<T*>(weight_data);
gru_value.stateWeight = gru_value.state_weight =
const_cast<T*>(weight_data + 2 * frame_size * frame_size); const_cast<T*>(weight_data + 2 * frame_size * frame_size);
math::hl_gru_grad<T> gru_grad; math::hl_gru_grad<T> gru_grad;
if (weight_grad) { if (weight_grad) {
gru_grad.gateWeightGrad = gru_grad.gate_weight_grad =
weight_grad->mutable_data<T>(context.GetPlace()); weight_grad->mutable_data<T>(context.GetPlace());
zero(dev_ctx, weight_grad, static_cast<T>(0.0)); zero(dev_ctx, weight_grad, static_cast<T>(0.0));
gru_grad.stateWeightGrad = gru_grad.state_weight_grad =
weight_grad->data<T>() + 2 * frame_size * frame_size; weight_grad->data<T>() + 2 * frame_size * frame_size;
} else { } else {
gru_grad.gateWeightGrad = nullptr; gru_grad.gate_weight_grad = nullptr;
gru_grad.stateWeightGrad = nullptr; gru_grad.state_weight_grad = nullptr;
} }
auto batch_starts = batch_hidden_grad.lod()[0]; auto batch_starts = batch_hidden_grad.lod()[0];
...@@ -193,27 +193,27 @@ class GRUGradKernel : public framework::OpKernel<T> { ...@@ -193,27 +193,27 @@ class GRUGradKernel : public framework::OpKernel<T> {
int cur_batch_size = bend - bstart; int cur_batch_size = bend - bstart;
Tensor gate_t = batch_gate->Slice(bstart, bend); Tensor gate_t = batch_gate->Slice(bstart, bend);
gru_value.gateValue = gate_t.data<T>(); gru_value.gate_value = gate_t.data<T>();
Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend); Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend);
gru_value.resetOutputValue = reset_hidden_prev_t.data<T>(); gru_value.reset_output_value = reset_hidden_prev_t.data<T>();
Tensor hidden_grad_t = batch_hidden_grad.Slice(bstart, bend); Tensor hidden_grad_t = batch_hidden_grad.Slice(bstart, bend);
gru_grad.outputGrad = hidden_grad_t.data<T>(); gru_grad.output_grad = hidden_grad_t.data<T>();
Tensor gate_grad_t = batch_gate_grad.Slice(bstart, bend); Tensor gate_grad_t = batch_gate_grad.Slice(bstart, bend);
gru_grad.gateGrad = gate_grad_t.data<T>(); gru_grad.gate_grad = gate_grad_t.data<T>();
Tensor reset_hidden_prev_grad_t = Tensor reset_hidden_prev_grad_t =
batch_reset_hidden_prev_grad.Slice(bstart, bend); batch_reset_hidden_prev_grad.Slice(bstart, bend);
gru_grad.resetOutputGrad = reset_hidden_prev_grad_t.data<T>(); gru_grad.reset_output_grad = reset_hidden_prev_grad_t.data<T>();
if (n == 0) { if (n == 0) {
gru_value.prevOutValue = h0 ? ordered_h0.data<T>() : nullptr; gru_value.prev_out_value = h0 ? ordered_h0.data<T>() : nullptr;
gru_grad.prevOutGrad = gru_grad.prev_out_grad =
h0 && h0_grad ? ordered_h0_grad.data<T>() : nullptr; h0 && h0_grad ? ordered_h0_grad.data<T>() : nullptr;
} else { } else {
int bstart_pre = static_cast<int>(batch_starts[n - 1]); int bstart_pre = static_cast<int>(batch_starts[n - 1]);
Tensor hidden_prev_t = batch_hidden->Slice(bstart_pre, bstart); Tensor hidden_prev_t = batch_hidden->Slice(bstart_pre, bstart);
gru_value.prevOutValue = hidden_prev_t.data<T>(); gru_value.prev_out_value = hidden_prev_t.data<T>();
Tensor hidden_prev_grad_t = batch_hidden_grad.Slice(bstart_pre, bstart); Tensor hidden_prev_grad_t = batch_hidden_grad.Slice(bstart_pre, bstart);
gru_grad.prevOutGrad = hidden_prev_grad_t.data<T>(); gru_grad.prev_out_grad = hidden_prev_grad_t.data<T>();
} }
math::GRUUnitGradFunctor<Place, T>::compute( math::GRUUnitGradFunctor<Place, T>::compute(
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/hinge_loss_op.h"
namespace paddle {
namespace operators {
// Forward operator definition of hinge_loss: validates the shapes of the
// "Logits" and "Labels" inputs and declares the shape of the "Loss" output.
class HingeLossOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  // Requires both inputs to be present, to have identical dims, and to be
  // 2-D with a second dimension of 1. "Loss" gets shape [dims[0], 1] and
  // shares the LoD of "Logits".
  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("Logits"),
                   "Input(Logits) must be initialized.");
    PADDLE_ENFORCE(ctx->HasInput("Labels"),
                   "Input(Labels) must be initialized.");

    auto logits_dims = ctx->GetInputDim("Logits");
    auto labels_dims = ctx->GetInputDim("Labels");

    PADDLE_ENFORCE_EQ(logits_dims, labels_dims);
    PADDLE_ENFORCE_EQ(logits_dims.size(), 2,
                      "The rank of Input(Logits) must be 2 and the shape is "
                      "[batch_size, 1].");
    PADDLE_ENFORCE_EQ(logits_dims[1], 1,
                      "Each row of Input(Logits) contains a real value, "
                      "so the 2nd dimension of Input(Logits) must be 1.");

    ctx->SetOutputDim("Loss", {logits_dims[0], 1});
    ctx->ShareLoD("Logits", "Loss");
  }
};
// Declares the inputs, the output and the user-facing documentation of the
// hinge_loss operator. AttrType is not used by this maker.
template <typename AttrType>
class HingeLossOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  HingeLossOpMaker(framework::OpProto* proto,
                   framework::OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    // Descriptions are assembled from adjacent string literals, so the first
    // literal ends with a space to keep the two sentences separated.
    AddInput("Logits",
             "The input value (Logits) of Hinge loss op. "
             "Logits is a 2-D tensor with shape [batch_size, 1].");
    AddInput("Labels",
             "The target value (Labels) of Hinge loss op. "
             "Labels is a 2-D tensor with shape [batch_size, 1].");
    AddOutput("Loss",
              "The output tensor with shape [batch_size, 1] "
              "which represents the hinge loss.");
    AddComment(R"DOC(
HingeLoss Operator.
Let x be a logit (prediction) and y be the actual label. The logit can
take any values from (-inf, inf), but the labels should be either -1 or 1.
Then, the hinge loss is computed as follows:
$$
L(x, y) = max(1 - y \cdot x, 0)
$$
Note that the labels passed as input will have values as either 0 or 1;
they are mapped to -1 or 1 (as 2 * label - 1) before the loss is computed.
)DOC");
  }
};
// Gradient operator definition of hinge_loss: validates that the forward
// inputs and the incoming loss gradient are present and sets the shape of
// the gradient with respect to "Logits".
class HingeLossGradOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  // Requires Logits, Labels and Loss@GRAD as inputs and Logits@GRAD as
  // output. Loss@GRAD must have the same dims as Logits; Logits@GRAD is given
  // those dims as well.
  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("Logits"),
                   "Input(Logits) should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Labels"),
                   "Input(Labels) should not be null.");
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Loss")),
                   "Input(Loss@GRAD) should not be null.");
    // Logits@GRAD is an output of this op, so the message says Output.
    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Logits")),
                   "Output(Logits@GRAD) should not be null.");

    auto pred_dims = ctx->GetInputDim("Logits");
    auto loss_grad_dims = ctx->GetInputDim(framework::GradVarName("Loss"));
    PADDLE_ENFORCE_EQ(loss_grad_dims, pred_dims);

    auto pred_grad_name = framework::GradVarName("Logits");
    ctx->SetOutputDim(pred_grad_name, pred_dims);
  }
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
// Register the forward op `hinge_loss` (with its proto maker instantiated for
// float) together with its gradient op `hinge_loss_grad`.
REGISTER_OP(hinge_loss, ops::HingeLossOp, ops::HingeLossOpMaker<float>,
hinge_loss_grad, ops::HingeLossGradOp);
// CPU kernels are registered for float only.
REGISTER_OP_CPU_KERNEL(hinge_loss,
ops::HingeLossKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(
hinge_loss_grad,
ops::HingeLossGradKernel<paddle::platform::CPUPlace, float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/operators/hinge_loss_op.h"
namespace ops = paddle::operators;
// GPU kernels for `hinge_loss` and `hinge_loss_grad`, registered for float
// only. They instantiate the same kernel templates as the CPU build, with
// platform::GPUPlace as the place.
REGISTER_OP_GPU_KERNEL(hinge_loss,
ops::HingeLossKernel<paddle::platform::GPUPlace, float>);
REGISTER_OP_GPU_KERNEL(
hinge_loss_grad,
ops::HingeLossGradKernel<paddle::platform::GPUPlace, float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
namespace paddle {
namespace operators {
// Forward kernel of hinge_loss. With labels y given as 0/1 and logits x, it
// writes max(1 - x * (2 * y - 1), 0) element-wise into "Loss".
template <typename Place, typename T, typename AttrType = T>
class HingeLossKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    const T zero = static_cast<T>(0);
    const T one = static_cast<T>(1);
    const T two = static_cast<T>(2);

    auto* logits_tensor = context.Input<framework::Tensor>("Logits");
    auto* labels_tensor = context.Input<framework::Tensor>("Labels");
    auto* loss_tensor = context.Output<framework::Tensor>("Loss");

    auto logits = framework::EigenVector<T>::Flatten(*logits_tensor);
    auto labels = framework::EigenVector<T>::Flatten(*labels_tensor);

    // The output buffer has to be allocated before it is viewed through Eigen.
    loss_tensor->mutable_data<T>(context.GetPlace());
    auto loss = framework::EigenVector<T>::Flatten(*loss_tensor);

    auto device = context.GetEigenDevice<Place>();
    // (2 * labels - 1) turns the 0/1 labels into -1/+1.
    loss.device(device) =
        (one - logits * (two * labels - one)).cwiseMax(zero);
  }
};
// Backward kernel of hinge_loss. When the Logits@GRAD output is present it is
// filled with dLoss * 1[x * s < 1] * (-s), where s = 2 * y - 1 is the label
// mapped from 0/1 to -1/+1.
template <typename Place, typename T, typename AttrType = T>
class HingeLossGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    const T one = static_cast<T>(1);
    const T two = static_cast<T>(2);

    auto* logits_tensor = context.Input<framework::Tensor>("Logits");
    auto* labels_tensor = context.Input<framework::Tensor>("Labels");
    auto* loss_grad_tensor =
        context.Input<framework::Tensor>(framework::GradVarName("Loss"));
    auto* logits_grad_tensor =
        context.Output<framework::Tensor>(framework::GradVarName("Logits"));

    auto device = context.GetEigenDevice<Place>();
    auto logits = framework::EigenVector<T>::Flatten(*logits_tensor);
    auto labels = framework::EigenVector<T>::Flatten(*labels_tensor);
    auto loss_grad = framework::EigenVector<T>::Flatten(*loss_grad_tensor);

    // Nothing to write when the gradient output is absent.
    if (!logits_grad_tensor) {
      return;
    }

    logits_grad_tensor->mutable_data<T>(context.GetPlace());
    auto logits_grad = framework::EigenVector<T>::Flatten(*logits_grad_tensor);
    auto signed_labels = two * labels - one;
    // The comparison yields a boolean mask; it is cast back to T so it can be
    // multiplied with the other factors.
    logits_grad.device(device) =
        loss_grad * ((logits * signed_labels) < one).template cast<T>() *
        (-signed_labels);
  }
};
} // namespace operators
} // namespace paddle
...@@ -61,6 +61,8 @@ class IncrementOp : public framework::OperatorBase { ...@@ -61,6 +61,8 @@ class IncrementOp : public framework::OperatorBase {
out.Resize(x.dims()); out.Resize(x.dims());
out.mutable_data(x.place(), x.type()); out.mutable_data(x.place(), x.type());
float value = Attr<float>("step"); float value = Attr<float>("step");
VLOG(10) << Output("Out") << " increase " << Input("X") << " with "
<< value;
framework::VisitDataType(framework::ToDataType(out.type()), framework::VisitDataType(framework::ToDataType(out.type()),
IncrementFunctor(x, &out, value)); IncrementFunctor(x, &out, value));
} }
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include "paddle/framework/lod_rank_table.h" #include "paddle/framework/lod_rank_table.h"
#include "paddle/framework/lod_tensor_array.h" #include "paddle/framework/lod_tensor_array.h"
#include "paddle/framework/op_registry.h" #include "paddle/framework/op_registry.h"
#include "paddle/operators/detail/safe_ref.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -32,15 +33,20 @@ class LoDTensorToArrayOp : public framework::OperatorBase { ...@@ -32,15 +33,20 @@ class LoDTensorToArrayOp : public framework::OperatorBase {
: OperatorBase(type, inputs, outputs, attrs) {} : OperatorBase(type, inputs, outputs, attrs) {}
void Run(const framework::Scope &scope, void Run(const framework::Scope &scope,
const platform::DeviceContext &dev_ctx) const override { const platform::DeviceContext &dev_ctx) const override {
auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>(); auto &x = detail::Ref(scope.FindVar(Input("X")), "Cannot find input %s",
auto &rank_table = Input("X"))
scope.FindVar(Input("RankTable"))->Get<framework::LoDRankTable>(); .Get<framework::LoDTensor>();
auto &out = auto &rank_table = detail::Ref(scope.FindVar(Input("RankTable")))
*scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensorArray>(); .Get<framework::LoDRankTable>();
auto &out = *detail::Ref(scope.FindVar(Output("Out")))
.GetMutable<framework::LoDTensorArray>();
auto &items = rank_table.items(); auto &items = rank_table.items();
auto max_seq_len = items[0].length; auto max_seq_len = items[0].length;
auto rank_level = rank_table.level(); auto rank_level = rank_table.level();
PADDLE_ENFORCE_LT(rank_level, x.lod().size(),
"Input should be a LOD tensor, and size is at least %d",
rank_level + 1);
out.resize(max_seq_len); out.resize(max_seq_len);
std::vector<std::vector<CopyRange>> copy_ranges(max_seq_len); std::vector<std::vector<CopyRange>> copy_ranges(max_seq_len);
...@@ -55,16 +61,13 @@ class LoDTensorToArrayOp : public framework::OperatorBase { ...@@ -55,16 +61,13 @@ class LoDTensorToArrayOp : public framework::OperatorBase {
size_t start_idx = x.lod()[rank_level][item.index] + t; size_t start_idx = x.lod()[rank_level][item.index] + t;
auto lod_and_offset = framework::GetSubLoDAndAbsoluteOffset( auto lod_and_offset = framework::GetSubLoDAndAbsoluteOffset(
x.lod(), start_idx, start_idx + 1, rank_level + 1); x.lod(), start_idx, start_idx + 1, rank_level + 1);
auto &lod_length = lod_and_offset.first; auto &lod_length = lod_and_offset.first;
framework::AppendLoD(&lod, lod_length); framework::AppendLoD(&lod, lod_length);
size_t start_offset = lod_and_offset.second.first; size_t start_offset = lod_and_offset.second.first;
size_t end_offset = lod_and_offset.second.second; size_t end_offset = lod_and_offset.second.second;
copy_ranges[t].emplace_back(CopyRange{start_offset, end_offset}); copy_ranges[t].emplace_back(CopyRange{start_offset, end_offset});
} }
} }
for (size_t i = 0; i < max_seq_len; ++i) { for (size_t i = 0; i < max_seq_len; ++i) {
auto &ranges = copy_ranges[i]; auto &ranges = copy_ranges[i];
size_t height = std::accumulate( size_t height = std::accumulate(
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/log_loss_op.h"
namespace paddle {
namespace operators {
// Forward operator definition of log_loss: validates the shapes of the
// "Predicted" and "Labels" inputs and declares the shape of the "Loss" output.
class LogLossOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  // Requires both inputs to be present, to have identical dims, and to be
  // 2-D with a second dimension of 1. "Loss" gets shape [dims[0], 1] and
  // shares the LoD of "Predicted".
  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("Predicted"),
                   "Input(Predicted) must be initialized.");
    PADDLE_ENFORCE(ctx->HasInput("Labels"),
                   "Input(Labels) must be initialized.");

    auto pred_dims = ctx->GetInputDim("Predicted");
    auto label_dims = ctx->GetInputDim("Labels");

    PADDLE_ENFORCE_EQ(pred_dims, label_dims);
    PADDLE_ENFORCE_EQ(pred_dims.size(), 2,
                      "The rank of Input(Predicted) must be 2 and the shape is "
                      "[batch_size, 1].");
    // The message names the input that is actually checked; this operator has
    // no input called X.
    PADDLE_ENFORCE_EQ(pred_dims[1], 1,
                      "Each row of Input(Predicted) contains a real value, "
                      "so the 2nd dimension of Input(Predicted) must be 1.");

    ctx->SetOutputDim("Loss", {pred_dims[0], 1});
    ctx->ShareLoD("Predicted", "Loss");
  }
};
// Declares the inputs, the output, the `epsilon` attribute (of type AttrType)
// and the user-facing documentation of the log_loss operator.
template <typename AttrType>
class LogLossOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  LogLossOpMaker(framework::OpProto* proto,
                 framework::OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    // Descriptions are assembled from adjacent string literals, so the first
    // literal ends with a space to keep the two sentences separated.
    AddInput("Predicted",
             "The input value (Predicted) of Log loss op. "
             "Predicted is a 2-D tensor with shape [batch_size, 1].");
    AddInput("Labels",
             "The target value (Labels) of Log loss op. "
             "Labels is a 2-D tensor with shape [batch_size, 1].");
    AddOutput("Loss",
              "The output tensor with shape [batch_size, 1] "
              "which represents the log loss.");
    AddAttr<AttrType>("epsilon", "Epsilon in log loss.");
    AddComment(R"DOC(
LogLoss Operator.
Log loss is a loss function used for binary classification. Log Loss quantifies
the accuracy of a classifier by penalising false classifications. Minimising the
Log Loss is equivalent to maximising the accuracy of the classifier. We define
Predicted as the values predicted by our model and Labels as the target ground
truth value. Log loss can evaluate how close the predicted values are to the
target. The shapes of Predicted and Labels are both [batch_size, 1].
The equation is:
$$
Loss = - Labels * log(Predicted + \epsilon) -
(1 - Labels) * log(1 - Predicted + \epsilon)
$$
)DOC");
  }
};
// Gradient operator definition of log_loss: validates that the forward inputs
// and the incoming loss gradient are present and sets the shape of the
// gradient with respect to "Predicted".
class LogLossGradOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  // Requires Predicted, Labels and Loss@GRAD as inputs and Predicted@GRAD as
  // output. Loss@GRAD must have the same dims as Predicted; Predicted@GRAD is
  // given those dims as well.
  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("Predicted"),
                   "Input(Predicted) should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Labels"),
                   "Input(Labels) should not be null.");
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Loss")),
                   "Input(Loss@GRAD) should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Predicted")),
                   "Output(Predicted@GRAD) should not be null.");

    // Only the dims of Predicted and Loss@GRAD take part in the check below.
    auto pred_dims = ctx->GetInputDim("Predicted");
    auto loss_grad_dims = ctx->GetInputDim(framework::GradVarName("Loss"));
    PADDLE_ENFORCE_EQ(loss_grad_dims, pred_dims);

    auto pred_grad_name = framework::GradVarName("Predicted");
    ctx->SetOutputDim(pred_grad_name, pred_dims);
  }
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
// Register the forward op `log_loss` (with its proto maker instantiated for
// float, which is also the type of the `epsilon` attribute) together with its
// gradient op `log_loss_grad`.
REGISTER_OP(log_loss, ops::LogLossOp, ops::LogLossOpMaker<float>, log_loss_grad,
ops::LogLossGradOp);
// CPU kernels are registered for float only.
REGISTER_OP_CPU_KERNEL(log_loss,
ops::LogLossKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(
log_loss_grad, ops::LogLossGradKernel<paddle::platform::CPUPlace, float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/operators/log_loss_op.h"
namespace ops = paddle::operators;
// GPU kernels for `log_loss` and `log_loss_grad`, registered for float only.
// They instantiate the same kernel templates as the CPU build, with
// platform::GPUPlace as the place.
REGISTER_OP_GPU_KERNEL(log_loss,
ops::LogLossKernel<paddle::platform::GPUPlace, float>);
REGISTER_OP_GPU_KERNEL(
log_loss_grad, ops::LogLossGradKernel<paddle::platform::GPUPlace, float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
namespace paddle {
namespace operators {
// Short alias for framework::Tensor within paddle::operators.
using Tensor = framework::Tensor;
// Alias that forwards its template arguments to framework::EigenVector,
// defaulting to row-major layout and Eigen::DenseIndex indices.
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
// Forward kernel of log_loss. Writes, element-wise,
//   -label * log(pred + eps) - (1 - label) * log(1 - pred + eps)
// into "Loss", where eps is the `epsilon` attribute converted to T.
template <typename Place, typename T, typename AttrType = T>
class LogLossKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    // Allocate the output before it is viewed through Eigen.
    auto* loss_tensor = ctx.Output<Tensor>("Loss");
    loss_tensor->mutable_data<T>(ctx.GetPlace());

    const T one = static_cast<T>(1);
    auto eps = static_cast<T>(ctx.Attr<AttrType>("epsilon"));

    auto pred = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Predicted"));
    auto target = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Labels"));
    auto loss_vec = EigenVector<T>::Flatten(*loss_tensor);

    auto device = ctx.GetEigenDevice<Place>();
    loss_vec.device(device) =
        (-(target * (pred + eps).log()) -
         ((one - target) * (one - pred + eps).log()));
  }
};
// Backward kernel of log_loss. When the Predicted@GRAD output is present it
// is filled, element-wise, with
//   dLoss * (-label / (pred + eps) + (1 - label) / (1 - pred + eps)),
// where eps is the `epsilon` attribute converted to T.
template <typename Place, typename T, typename AttrType = T>
class LogLossGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    const T one = static_cast<T>(1);
    auto eps = static_cast<T>(ctx.Attr<AttrType>("epsilon"));

    auto pred = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Predicted"));
    auto target = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Labels"));

    auto* loss_grad_tensor = ctx.Input<Tensor>(framework::GradVarName("Loss"));
    auto* pred_grad_tensor =
        ctx.Output<Tensor>(framework::GradVarName("Predicted"));

    auto loss_grad = EigenVector<T>::Flatten(*loss_grad_tensor);
    auto device = ctx.GetEigenDevice<Place>();

    // Nothing to write when the gradient output is absent.
    if (!pred_grad_tensor) {
      return;
    }

    pred_grad_tensor->mutable_data<T>(ctx.GetPlace());
    auto pred_grad = framework::EigenVector<T>::Flatten(*pred_grad_tensor);
    pred_grad.device(device) =
        loss_grad * (-(target / (pred + eps)) +
                     ((one - target) / (one - pred + eps)));
  }
};
} // namespace operators
} // namespace paddle
...@@ -198,27 +198,27 @@ c_t = f_t \odot c_{t-1} + i_t \odot \tilde{c_t} \\ ...@@ -198,27 +198,27 @@ c_t = f_t \odot c_{t-1} + i_t \odot \tilde{c_t} \\
h_t = o_t \odot act_h(c_t) h_t = o_t \odot act_h(c_t)
$$ $$
where the W terms denote weight matrices (e.g. \f$W_{xi}\f$ is the matrix where the W terms denote weight matrices (e.g. $W_{xi}$ is the matrix
of weights from the input gate to the input), \f$W_{ic}, W_{fc}, W_{oc}\f$ of weights from the input gate to the input), $W_{ic}, W_{fc}, W_{oc}$
are diagonal weight matrices for peephole connections. In our implementation, are diagonal weight matrices for peephole connections. In our implementation,
we use vectors to reprenset these diagonal weight matrices. The b terms we use vectors to reprenset these diagonal weight matrices. The b terms
denote bias vectors (\f$b_i\f$ is the input gate bias vector), \f$\sigma\f$ denote bias vectors ($b_i$ is the input gate bias vector), $\sigma$
is the non-line activations, such as logistic sigmoid function, and is the non-line activations, such as logistic sigmoid function, and
\f$i, f, o\f$ and \f$c\f$ are the input gate, forget gate, output gate, $i, f, o$ and $c$ are the input gate, forget gate, output gate,
and cell activation vectors, respectively, all of which have the same size as and cell activation vectors, respectively, all of which have the same size as
the cell output activation vector \f$h\f$. the cell output activation vector $h$.
The \f$\odot\f$ is the element-wise product of the vectors. \f$act_g\f$ and \f$act_h\f$ The $\odot$ is the element-wise product of the vectors. $act_g$ and $act_h$
are the cell input and cell output activation functions and `tanh` is usually are the cell input and cell output activation functions and `tanh` is usually
used for them. \f$\tilde{c_t}\f$ is also called candidate hidden state, used for them. $\tilde{c_t}$ is also called candidate hidden state,
which is computed based on the current input and the previous hidden state. which is computed based on the current input and the previous hidden state.
Set `use_peepholes` False to disable peephole connection Set `use_peepholes` False to disable peephole connection. The formula
(http://www.bioinf.jku.at/publications/older/2604.pdf). The formula is omitted here, please refer to the paper
is omitted here. http://www.bioinf.jku.at/publications/older/2604.pdf for details.
Note that these \f$W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}\f$ Note that these $W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}$
operations on the input \f$x_{t}\f$ are NOT included in this operator. operations on the input $x_{t}$ are NOT included in this operator.
Users can choose to use fully-connect operator before LSTM operator. Users can choose to use fully-connect operator before LSTM operator.
)DOC"); )DOC");
......
...@@ -73,15 +73,15 @@ class LSTMKernel : public framework::OpKernel<T> { ...@@ -73,15 +73,15 @@ class LSTMKernel : public framework::OpKernel<T> {
T* bias_data = const_cast<T*>(bias->data<T>()); T* bias_data = const_cast<T*>(bias->data<T>());
// the code style in LstmMetaValue will be updated later. // the code style in LstmMetaValue will be updated later.
lstm_value.checkIg = bias_data + 4 * frame_size; lstm_value.check_ig = bias_data + 4 * frame_size;
lstm_value.checkFg = lstm_value.checkIg + frame_size; lstm_value.check_fg = lstm_value.check_ig + frame_size;
lstm_value.checkOg = lstm_value.checkFg + frame_size; lstm_value.check_og = lstm_value.check_fg + frame_size;
} else { } else {
lstm_value.checkIg = nullptr; lstm_value.check_ig = nullptr;
lstm_value.checkFg = nullptr; lstm_value.check_fg = nullptr;
lstm_value.checkOg = nullptr; lstm_value.check_og = nullptr;
} }
lstm_value.prevStateValue = nullptr; lstm_value.prev_state_value = nullptr;
Tensor ordered_c0; Tensor ordered_c0;
const size_t* order = batch_gate->lod()[2].data(); const size_t* order = batch_gate->lod()[2].data();
if (cell_t0) { if (cell_t0) {
...@@ -90,7 +90,7 @@ class LSTMKernel : public framework::OpKernel<T> { ...@@ -90,7 +90,7 @@ class LSTMKernel : public framework::OpKernel<T> {
// to reorder. // to reorder.
ReorderInitState<Place, T>(device_ctx, *cell_t0, order, &ordered_c0, ReorderInitState<Place, T>(device_ctx, *cell_t0, order, &ordered_c0,
true); true);
lstm_value.prevStateValue = ordered_c0.data<T>(); lstm_value.prev_state_value = ordered_c0.data<T>();
} }
// Use the local variable as here. // Use the local variable as here.
...@@ -140,14 +140,14 @@ class LSTMKernel : public framework::OpKernel<T> { ...@@ -140,14 +140,14 @@ class LSTMKernel : public framework::OpKernel<T> {
static_cast<T>(1.0)); static_cast<T>(1.0));
} }
lstm_value.gateValue = gate_t.data<T>(); lstm_value.gate_value = gate_t.data<T>();
lstm_value.outputValue = out_t.data<T>(); lstm_value.output_value = out_t.data<T>();
lstm_value.stateValue = cell_t.data<T>(); lstm_value.state_value = cell_t.data<T>();
lstm_value.stateActiveValue = cell_pre_act_t.data<T>(); lstm_value.state_active_value = cell_pre_act_t.data<T>();
math::LstmUnitFunctor<Place, T>::compute(device_ctx, lstm_value, math::LstmUnitFunctor<Place, T>::compute(device_ctx, lstm_value,
frame_size, cur_batch_size, frame_size, cur_batch_size,
gate_act, cell_act, cand_act); gate_act, cell_act, cand_act);
lstm_value.prevStateValue = lstm_value.stateValue; lstm_value.prev_state_value = lstm_value.state_value;
} }
math::Batch2LoDTensorFunctor<Place, T> to_seq; math::Batch2LoDTensorFunctor<Place, T> to_seq;
...@@ -214,13 +214,13 @@ class LSTMGradKernel : public framework::OpKernel<T> { ...@@ -214,13 +214,13 @@ class LSTMGradKernel : public framework::OpKernel<T> {
math::LstmMetaValue<T> lstm_value; math::LstmMetaValue<T> lstm_value;
if (bias && ctx.Attr<bool>("use_peepholes")) { if (bias && ctx.Attr<bool>("use_peepholes")) {
T* bias_data = const_cast<T*>(bias->data<T>()); T* bias_data = const_cast<T*>(bias->data<T>());
lstm_value.checkIg = bias_data + 4 * frame_size; lstm_value.check_ig = bias_data + 4 * frame_size;
lstm_value.checkFg = lstm_value.checkIg + frame_size; lstm_value.check_fg = lstm_value.check_ig + frame_size;
lstm_value.checkOg = lstm_value.checkFg + frame_size; lstm_value.check_og = lstm_value.check_fg + frame_size;
} else { } else {
lstm_value.checkIg = nullptr; lstm_value.check_ig = nullptr;
lstm_value.checkFg = nullptr; lstm_value.check_fg = nullptr;
lstm_value.checkOg = nullptr; lstm_value.check_og = nullptr;
} }
math::LstmMetaGrad<T> lstm_grad; math::LstmMetaGrad<T> lstm_grad;
...@@ -231,13 +231,13 @@ class LSTMGradKernel : public framework::OpKernel<T> { ...@@ -231,13 +231,13 @@ class LSTMGradKernel : public framework::OpKernel<T> {
} }
if (bias && bias_g && ctx.Attr<bool>("use_peepholes")) { if (bias && bias_g && ctx.Attr<bool>("use_peepholes")) {
T* bias_g_data = bias_g->data<T>(); T* bias_g_data = bias_g->data<T>();
lstm_grad.checkIgGrad = bias_g_data + 4 * frame_size; lstm_grad.check_ig_grad = bias_g_data + 4 * frame_size;
lstm_grad.checkFgGrad = lstm_grad.checkIgGrad + frame_size; lstm_grad.check_fg_grad = lstm_grad.check_ig_grad + frame_size;
lstm_grad.checkOgGrad = lstm_grad.checkFgGrad + frame_size; lstm_grad.check_og_grad = lstm_grad.check_fg_grad + frame_size;
} else { } else {
lstm_grad.checkIgGrad = nullptr; lstm_grad.check_ig_grad = nullptr;
lstm_grad.checkFgGrad = nullptr; lstm_grad.check_fg_grad = nullptr;
lstm_grad.checkOgGrad = nullptr; lstm_grad.check_og_grad = nullptr;
} }
math::LoDTensor2BatchFunctor<Place, T> to_batch; math::LoDTensor2BatchFunctor<Place, T> to_batch;
...@@ -276,26 +276,26 @@ class LSTMGradKernel : public framework::OpKernel<T> { ...@@ -276,26 +276,26 @@ class LSTMGradKernel : public framework::OpKernel<T> {
Tensor gate = batch_gate->Slice(bstart, bend); Tensor gate = batch_gate->Slice(bstart, bend);
Tensor cell = batch_cell.Slice(bstart, bend); Tensor cell = batch_cell.Slice(bstart, bend);
Tensor cell_pre_act = batch_cell_pre_act->Slice(bstart, bend); Tensor cell_pre_act = batch_cell_pre_act->Slice(bstart, bend);
lstm_value.gateValue = gate.data<T>(); lstm_value.gate_value = gate.data<T>();
lstm_value.stateValue = cell.data<T>(); lstm_value.state_value = cell.data<T>();
lstm_value.stateActiveValue = cell_pre_act.data<T>(); lstm_value.state_active_value = cell_pre_act.data<T>();
Tensor out_g = batch_hidden_g.Slice(bstart, bend); Tensor out_g = batch_hidden_g.Slice(bstart, bend);
Tensor gate_g = batch_gate_g.Slice(bstart, bend); Tensor gate_g = batch_gate_g.Slice(bstart, bend);
Tensor cell_g = batch_cell_g.Slice(bstart, bend); Tensor cell_g = batch_cell_g.Slice(bstart, bend);
lstm_grad.stateGrad = cell_g.data<T>(); lstm_grad.state_grad = cell_g.data<T>();
lstm_grad.gateGrad = gate_g.data<T>(); lstm_grad.gate_grad = gate_g.data<T>();
lstm_grad.outputGrad = out_g.data<T>(); lstm_grad.output_grad = out_g.data<T>();
if (n > 0) { if (n > 0) {
int bstart_pre = static_cast<int>(batch_starts[n - 1]); int bstart_pre = static_cast<int>(batch_starts[n - 1]);
Tensor cell_pre = batch_cell.Slice(bstart_pre, bstart); Tensor cell_pre = batch_cell.Slice(bstart_pre, bstart);
Tensor cell_pre_g = batch_cell_g.Slice(bstart_pre, bstart); Tensor cell_pre_g = batch_cell_g.Slice(bstart_pre, bstart);
lstm_value.prevStateValue = cell_pre.data<T>(); lstm_value.prev_state_value = cell_pre.data<T>();
lstm_grad.prevStateGrad = cell_pre_g.data<T>(); lstm_grad.prev_state_grad = cell_pre_g.data<T>();
} else { } else {
lstm_value.prevStateValue = c0 ? ordered_c0.data<T>() : nullptr; lstm_value.prev_state_value = c0 ? ordered_c0.data<T>() : nullptr;
lstm_grad.prevStateGrad = c0_g ? ordered_c0_g.data<T>() : nullptr; lstm_grad.prev_state_grad = c0_g ? ordered_c0_g.data<T>() : nullptr;
} }
int cur_batch_size = bend - bstart; int cur_batch_size = bend - bstart;
......
...@@ -13,8 +13,9 @@ if(WITH_GPU) ...@@ -13,8 +13,9 @@ if(WITH_GPU)
nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context math_function) nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context math_function)
nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context) nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context)
nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions) nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions)
nv_library(gru_compute SRCS gru_compute.cc gru_compute.cu DEPS device_context activation_functions math_function)
nv_library(maxouting SRCS maxouting.cc maxouting.cu DEPS device_context) nv_library(maxouting SRCS maxouting.cc maxouting.cu DEPS device_context)
nv_library(unpooling SRCS unpooling.cc unpooling.cu DEPS device_context)
nv_library(gru_compute SRCS gru_compute.cc gru_compute.cu DEPS device_context activation_functions math_function)
else() else()
cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context framework_proto) cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context framework_proto)
cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function) cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function)
...@@ -26,8 +27,9 @@ else() ...@@ -26,8 +27,9 @@ else()
cc_library(context_project SRCS context_project.cc DEPS device_context math_function) cc_library(context_project SRCS context_project.cc DEPS device_context math_function)
cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context) cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context)
cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions) cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions)
cc_library(gru_compute SRCS gru_compute.cc DEPS device_context activation_functions math_function)
cc_library(maxouting SRCS maxouting.cc DEPS device_context) cc_library(maxouting SRCS maxouting.cc DEPS device_context)
cc_library(unpooling SRCS unpooling.cc DEPS device_context)
cc_library(gru_compute SRCS gru_compute.cc DEPS device_context activation_functions math_function)
endif() endif()
cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor) cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)
......
...@@ -25,393 +25,397 @@ namespace detail { ...@@ -25,393 +25,397 @@ namespace detail {
#ifndef __NVCC__ #ifndef __NVCC__
template <class OpResetOutput, typename T> template <class OpResetOutput, typename T>
void hl_naive_gru_forward_reset_output(OpResetOutput opResetOutput, void hl_naive_gru_forward_reset_output(OpResetOutput op_reset_output,
T *gateValue, T *resetOutputValue, T *gate_value, T *reset_output_value,
T *prevOutputValue, int frameSize, T *prev_output_value, int frame_size,
activation_mode_t active_gate) { activation_mode_t active_gate) {
T rValueUpdateGate; T r_value_update_gate;
T rValueResetGate; T r_value_reset_gate;
T rValueResetOutput; T r_value_reset_output;
T rPrevOut = 0; T r_prev_out = 0;
T *updateGate = gateValue; T *update_gate = gate_value;
T *resetGate = gateValue + frameSize; T *reset_gate = gate_value + frame_size;
for (int i = 0; i < frameSize; i++) { for (int i = 0; i < frame_size; i++) {
rValueUpdateGate = updateGate[i]; r_value_update_gate = update_gate[i];
rValueResetGate = resetGate[i]; r_value_reset_gate = reset_gate[i];
if (prevOutputValue) { if (prev_output_value) {
rPrevOut = prevOutputValue[i]; r_prev_out = prev_output_value[i];
} }
opResetOutput(rValueUpdateGate, rValueResetGate, rPrevOut, op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out,
rValueResetOutput, active_gate); r_value_reset_output, active_gate);
updateGate[i] = rValueUpdateGate; update_gate[i] = r_value_update_gate;
resetGate[i] = rValueResetGate; reset_gate[i] = r_value_reset_gate;
resetOutputValue[i] = rValueResetOutput; reset_output_value[i] = r_value_reset_output;
} }
} }
template <class OpFinalOutput, typename T> template <class OpFinalOutput, typename T>
void hl_naive_gru_forward_final_output(OpFinalOutput opFinalOutput, void hl_naive_gru_forward_final_output(OpFinalOutput op_final_output,
T *gateValue, T *prevOutputValue, T *gate_value, T *prev_output_value,
T *outputValue, int frameSize, T *output_value, int frame_size,
activation_mode_t active_node) { activation_mode_t active_node) {
T rValueUpdateGate; T r_value_update_gate;
T rValueFrameState; T r_value_frame_state;
T rPrevOut = 0; T r_prev_out = 0;
T rOutput; T r_output;
T *updateGate = gateValue; T *update_gate = gate_value;
T *frameState = gateValue + frameSize * 2; T *frame_state = gate_value + frame_size * 2;
for (int i = 0; i < frameSize; i++) { for (int i = 0; i < frame_size; i++) {
rValueUpdateGate = updateGate[i]; r_value_update_gate = update_gate[i];
rValueFrameState = frameState[i]; r_value_frame_state = frame_state[i];
if (prevOutputValue) { if (prev_output_value) {
rPrevOut = prevOutputValue[i]; r_prev_out = prev_output_value[i];
} }
opFinalOutput(rValueUpdateGate, rValueFrameState, rPrevOut, rOutput, op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out,
active_node); r_output, active_node);
frameState[i] = rValueFrameState; frame_state[i] = r_value_frame_state;
outputValue[i] = rOutput; output_value[i] = r_output;
} }
} }
template <class OpResetOutput, typename T> template <class OpResetOutput, typename T>
void hl_avx_gru_forward_reset_output(OpResetOutput opResetOutput, T *gateValue, void hl_avx_gru_forward_reset_output(OpResetOutput op_reset_output,
T *resetOutputValue, T *prevOutputValue, T *gate_value, T *reset_output_value,
int frameSize, T *prev_output_value, int frame_size,
activation_mode_t active_gate) { activation_mode_t active_gate) {
#ifdef __AVX__ #ifdef __AVX__
__m256 rValueUpdateGate; __m256 r_value_update_gate;
__m256 rValueResetGate; __m256 r_value_reset_gate;
__m256 rValueResetOutput; __m256 r_value_reset_output;
__m256 rPrevOut = _mm256_set1_ps(0.0f); __m256 r_prev_out = _mm256_set1_ps(0.0f);
__m256 *updateGate = (__m256 *)gateValue; __m256 *update_gate = (__m256 *)gate_value;
__m256 *resetGate = (__m256 *)(gateValue + frameSize); __m256 *reset_gate = (__m256 *)(gate_value + frame_size);
for (int i = 0; i < frameSize / 8; i++) { for (int i = 0; i < frame_size / 8; i++) {
rValueUpdateGate = updateGate[i]; r_value_update_gate = update_gate[i];
rValueResetGate = resetGate[i]; r_value_reset_gate = reset_gate[i];
if (prevOutputValue) { if (prev_output_value) {
rPrevOut = ((__m256 *)prevOutputValue)[i]; r_prev_out = ((__m256 *)prev_output_value)[i];
} }
opResetOutput(rValueUpdateGate, rValueResetGate, rPrevOut, op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out,
rValueResetOutput, active_gate); r_value_reset_output, active_gate);
updateGate[i] = rValueUpdateGate; update_gate[i] = r_value_update_gate;
resetGate[i] = rValueResetGate; reset_gate[i] = r_value_reset_gate;
((__m256 *)resetOutputValue)[i] = rValueResetOutput; ((__m256 *)reset_output_value)[i] = r_value_reset_output;
} }
#endif #endif
} }
template <class OpFinalOutput, typename T> template <class OpFinalOutput, typename T>
void hl_avx_gru_forward_final_output(OpFinalOutput opFinalOutput, T *gateValue, void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output,
T *prevOutputValue, T *outputValue, T *gate_value, T *prev_output_value,
int frameSize, T *output_value, int frame_size,
activation_mode_t active_node) { activation_mode_t active_node) {
#ifdef __AVX__ #ifdef __AVX__
__m256 rValueUpdateGate; __m256 r_value_update_gate;
__m256 rValueFrameState; __m256 r_value_frame_state;
__m256 rPrevOut = _mm256_set1_ps(0.0f); __m256 r_prev_out = _mm256_set1_ps(0.0f);
__m256 rOutput; __m256 r_output;
__m256 *updateGate = (__m256 *)gateValue; __m256 *update_gate = (__m256 *)gate_value;
__m256 *frameState = (__m256 *)(gateValue + frameSize * 2); __m256 *frame_state = (__m256 *)(gate_value + frame_size * 2);
for (int i = 0; i < frameSize / 8; i++) { for (int i = 0; i < frame_size / 8; i++) {
rValueUpdateGate = updateGate[i]; r_value_update_gate = update_gate[i];
rValueFrameState = frameState[i]; r_value_frame_state = frame_state[i];
if (prevOutputValue) { if (prev_output_value) {
rPrevOut = ((__m256 *)prevOutputValue)[i]; r_prev_out = ((__m256 *)prev_output_value)[i];
} }
opFinalOutput(rValueUpdateGate, rValueFrameState, rPrevOut, rOutput, op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out,
active_node); r_output, active_node);
frameState[i] = rValueFrameState; frame_state[i] = r_value_frame_state;
((__m256 *)outputValue)[i] = rOutput; ((__m256 *)output_value)[i] = r_output;
} }
#endif #endif
} }
template <class OpResetOutput, typename T> template <class OpResetOutput, typename T>
inline void forward_reset_output(OpResetOutput opResetOutput, inline void forward_reset_output(OpResetOutput op_reset_output,
hl_gru_value<T> value, int frameSize, hl_gru_value<T> value, int frame_size,
int batchSize, activation_mode_t active_gate) { int batch_size,
for (int b = 0; b < batchSize; b++) { activation_mode_t active_gate) {
if (OpResetOutput::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) { for (int b = 0; b < batch_size; b++) {
if (OpResetOutput::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
hl_avx_gru_forward_reset_output( hl_avx_gru_forward_reset_output(
opResetOutput, value.gateValue, value.resetOutputValue, op_reset_output, value.gate_value, value.reset_output_value,
value.prevOutValue, frameSize, active_gate); value.prev_out_value, frame_size, active_gate);
} else { } else {
hl_naive_gru_forward_reset_output( hl_naive_gru_forward_reset_output(
opResetOutput, value.gateValue, value.resetOutputValue, op_reset_output, value.gate_value, value.reset_output_value,
value.prevOutValue, frameSize, active_gate); value.prev_out_value, frame_size, active_gate);
} }
value.gateValue += frameSize * 3; value.gate_value += frame_size * 3;
value.resetOutputValue += frameSize; value.reset_output_value += frame_size;
if (value.prevOutValue) { if (value.prev_out_value) {
value.prevOutValue += frameSize; value.prev_out_value += frame_size;
} }
} }
} }
template <class OpFinalOutput, typename T> template <class OpFinalOutput, typename T>
inline void forward_final_output(OpFinalOutput opFinalOutput, inline void forward_final_output(OpFinalOutput op_final_output,
hl_gru_value<T> value, int frameSize, hl_gru_value<T> value, int frame_size,
int batchSize, activation_mode_t active_node) { int batch_size,
for (int b = 0; b < batchSize; b++) { activation_mode_t active_node) {
if (OpFinalOutput::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) { for (int b = 0; b < batch_size; b++) {
hl_avx_gru_forward_final_output(opFinalOutput, value.gateValue, if (OpFinalOutput::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
value.prevOutValue, value.outputValue, hl_avx_gru_forward_final_output(op_final_output, value.gate_value,
frameSize, active_node); value.prev_out_value, value.output_value,
frame_size, active_node);
} else { } else {
hl_naive_gru_forward_final_output(opFinalOutput, value.gateValue, hl_naive_gru_forward_final_output(
value.prevOutValue, value.outputValue, op_final_output, value.gate_value, value.prev_out_value,
frameSize, active_node); value.output_value, frame_size, active_node);
} }
value.gateValue += frameSize * 3; value.gate_value += frame_size * 3;
value.outputValue += frameSize; value.output_value += frame_size;
if (value.prevOutValue) { if (value.prev_out_value) {
value.prevOutValue += frameSize; value.prev_out_value += frame_size;
} }
} }
} }
template <class OpStateGrad, typename T> template <class OpStateGrad, typename T>
void hl_naive_gru_backward_state_grad(OpStateGrad opStateGrad, T *gateValue, void hl_naive_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value,
T *gateGrad, T *prevOutValue, T *gate_grad, T *prev_out_value,
T *prevOutGrad, T *outputGrad, T *prev_out_grad, T *output_grad,
int frameSize, int frame_size,
activation_mode_t active_node) { activation_mode_t active_node) {
T rUpdateGateValue; T r_update_gate_value;
T rUpdateGateGrad; T r_update_gate_grad;
T rFrameStateValue; T r_frame_state_value;
T rFrameStateGrad; T r_frame_state_grad;
T rOutGrad; T r_out_grad;
T rPrevOutValue = 0; T r_prev_out_value = 0;
T rPrevOutGrad = 0; T r_prev_out_grad = 0;
T *updateGateValue = gateValue; T *update_gate_value = gate_value;
T *updateGateGrad = gateGrad; T *update_gate_grad = gate_grad;
T *frameStateValue = gateValue + frameSize * 2; T *frame_state_value = gate_value + frame_size * 2;
T *frameStateGrad = gateGrad + frameSize * 2; T *frame_state_grad = gate_grad + frame_size * 2;
for (int i = 0; i < frameSize; i++) { for (int i = 0; i < frame_size; i++) {
rUpdateGateValue = updateGateValue[i]; r_update_gate_value = update_gate_value[i];
rFrameStateValue = frameStateValue[i]; r_frame_state_value = frame_state_value[i];
rOutGrad = outputGrad[i]; r_out_grad = output_grad[i];
if (prevOutValue) { if (prev_out_value) {
rPrevOutValue = prevOutValue[i]; r_prev_out_value = prev_out_value[i];
} }
if (prevOutGrad) { if (prev_out_grad) {
rPrevOutGrad = prevOutGrad[i]; r_prev_out_grad = prev_out_grad[i];
} }
opStateGrad(rUpdateGateValue, rUpdateGateGrad, rFrameStateValue, op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value,
rFrameStateGrad, rPrevOutValue, rPrevOutGrad, rOutGrad, r_frame_state_grad, r_prev_out_value, r_prev_out_grad,
active_node); r_out_grad, active_node);
updateGateGrad[i] = rUpdateGateGrad; update_gate_grad[i] = r_update_gate_grad;
frameStateGrad[i] = rFrameStateGrad; frame_state_grad[i] = r_frame_state_grad;
if (prevOutGrad) { if (prev_out_grad) {
prevOutGrad[i] = rPrevOutGrad; prev_out_grad[i] = r_prev_out_grad;
} }
} }
} }
template <class OpResetGrad, typename T> template <class OpResetGrad, typename T>
void hl_naive_gru_backward_reset_grad(OpResetGrad opResetGrad, T *gateValue, void hl_naive_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value,
T *gateGrad, T *prevOutValue, T *gate_grad, T *prev_out_value,
T *prevOutGrad, T *resetOutputGrad, T *prev_out_grad, T *reset_output_grad,
int frameSize, int frame_size,
activation_mode_t active_gate) { activation_mode_t active_gate) {
T rUpdateGateValue; T r_update_gate_value;
T rUpdateGateGrad; T r_update_gate_grad;
T rResetGateValue; T r_reset_gate_value;
T rResetGateGrad; T r_reset_gate_grad;
T rResetOutputGrad = 0; T r_reset_output_grad = 0;
T rPrevOutValue = 0; T r_prev_out_value = 0;
T rPrevOutGrad = 0; T r_prev_out_grad = 0;
T *updateGateValue = gateValue; T *update_gate_value = gate_value;
T *updateGateGrad = gateGrad; T *update_gate_grad = gate_grad;
T *resetGateValue = gateValue + frameSize; T *reset_gate_value = gate_value + frame_size;
T *resetGateGrad = gateGrad + frameSize; T *reset_gate_grad = gate_grad + frame_size;
for (int i = 0; i < frameSize; i++) { for (int i = 0; i < frame_size; i++) {
rUpdateGateValue = updateGateValue[i]; r_update_gate_value = update_gate_value[i];
rUpdateGateGrad = updateGateGrad[i]; r_update_gate_grad = update_gate_grad[i];
rResetGateValue = resetGateValue[i]; r_reset_gate_value = reset_gate_value[i];
if (prevOutValue && prevOutGrad) { if (prev_out_value && prev_out_grad) {
rResetOutputGrad = resetOutputGrad[i]; r_reset_output_grad = reset_output_grad[i];
} }
if (prevOutValue) { if (prev_out_value) {
rPrevOutValue = prevOutValue[i]; r_prev_out_value = prev_out_value[i];
} }
if (prevOutGrad) { if (prev_out_grad) {
rPrevOutGrad = prevOutGrad[i]; r_prev_out_grad = prev_out_grad[i];
} }
opResetGrad(rUpdateGateValue, rUpdateGateGrad, rResetGateValue, op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value,
rResetGateGrad, rPrevOutValue, rPrevOutGrad, rResetOutputGrad, r_reset_gate_grad, r_prev_out_value, r_prev_out_grad,
active_gate); r_reset_output_grad, active_gate);
updateGateGrad[i] = rUpdateGateGrad; update_gate_grad[i] = r_update_gate_grad;
resetGateGrad[i] = rResetGateGrad; reset_gate_grad[i] = r_reset_gate_grad;
if (prevOutGrad) { if (prev_out_grad) {
prevOutGrad[i] = rPrevOutGrad; prev_out_grad[i] = r_prev_out_grad;
} }
} }
} }
template <class OpStateGrad, typename T> template <class OpStateGrad, typename T>
void hl_avx_gru_backward_state_grad(OpStateGrad opStateGrad, T *gateValue, void hl_avx_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value,
T *gateGrad, T *prevOutValue, T *gate_grad, T *prev_out_value,
T *prevOutGrad, T *outputGrad, T *prev_out_grad, T *output_grad,
int frameSize, int frame_size,
activation_mode_t active_node) { activation_mode_t active_node) {
#ifdef __AVX__ #ifdef __AVX__
__m256 rUpdateGateValue; __m256 r_update_gate_value;
__m256 rUpdateGateGrad; __m256 r_update_gate_grad;
__m256 rFrameStateValue; __m256 r_frame_state_value;
__m256 rFrameStateGrad; __m256 r_frame_state_grad;
__m256 rOutGrad; __m256 r_out_grad;
__m256 rPrevOutValue = _mm256_set1_ps(0.0f); __m256 r_prev_out_value = _mm256_set1_ps(0.0f);
__m256 rPrevOutGrad = _mm256_set1_ps(0.0f); __m256 r_prev_out_grad = _mm256_set1_ps(0.0f);
__m256 *updateGateValue = (__m256 *)gateValue; __m256 *update_gate_value = (__m256 *)gate_value;
__m256 *updateGateGrad = (__m256 *)gateGrad; __m256 *update_gate_grad = (__m256 *)gate_grad;
__m256 *frameStateValue = (__m256 *)(gateValue + frameSize * 2); __m256 *frame_state_value = (__m256 *)(gate_value + frame_size * 2);
__m256 *frameStateGrad = (__m256 *)(gateGrad + frameSize * 2); __m256 *frame_state_grad = (__m256 *)(gate_grad + frame_size * 2);
for (int i = 0; i < frameSize / 8; i++) { for (int i = 0; i < frame_size / 8; i++) {
rUpdateGateValue = updateGateValue[i]; r_update_gate_value = update_gate_value[i];
rFrameStateValue = frameStateValue[i]; r_frame_state_value = frame_state_value[i];
rOutGrad = ((__m256 *)outputGrad)[i]; r_out_grad = ((__m256 *)output_grad)[i];
if (prevOutValue) { if (prev_out_value) {
rPrevOutValue = ((__m256 *)prevOutValue)[i]; r_prev_out_value = ((__m256 *)prev_out_value)[i];
} }
if (prevOutGrad) { if (prev_out_grad) {
rPrevOutGrad = ((__m256 *)prevOutGrad)[i]; r_prev_out_grad = ((__m256 *)prev_out_grad)[i];
} }
opStateGrad(rUpdateGateValue, rUpdateGateGrad, rFrameStateValue, op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value,
rFrameStateGrad, rPrevOutValue, rPrevOutGrad, rOutGrad, r_frame_state_grad, r_prev_out_value, r_prev_out_grad,
active_node); r_out_grad, active_node);
updateGateGrad[i] = rUpdateGateGrad; update_gate_grad[i] = r_update_gate_grad;
frameStateGrad[i] = rFrameStateGrad; frame_state_grad[i] = r_frame_state_grad;
if (prevOutGrad) { if (prev_out_grad) {
((__m256 *)prevOutGrad)[i] = rPrevOutGrad; ((__m256 *)prev_out_grad)[i] = r_prev_out_grad;
} }
} }
#endif #endif
} }
template <class OpResetGrad, typename T> template <class OpResetGrad, typename T>
void hl_avx_gru_backward_reset_grad(OpResetGrad opResetGrad, T *gateValue, void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value,
T *gateGrad, T *prevOutValue, T *gate_grad, T *prev_out_value,
T *prevOutGrad, T *resetOutputGrad, T *prev_out_grad, T *reset_output_grad,
int frameSize, int frame_size,
activation_mode_t active_gate) { activation_mode_t active_gate) {
#ifdef __AVX__ #ifdef __AVX__
__m256 rUpdateGateValue; __m256 r_update_gate_value;
__m256 rUpdateGateGrad; __m256 r_update_gate_grad;
__m256 rResetGateValue; __m256 r_reset_gate_value;
__m256 rResetGateGrad; __m256 r_reset_gate_grad;
__m256 rResetOutputGrad = _mm256_set1_ps(0.0f); __m256 r_reset_output_grad = _mm256_set1_ps(0.0f);
__m256 rPrevOutValue = _mm256_set1_ps(0.0f); __m256 r_prev_out_value = _mm256_set1_ps(0.0f);
__m256 rPrevOutGrad = _mm256_set1_ps(0.0f); __m256 r_prev_out_grad = _mm256_set1_ps(0.0f);
__m256 *updateGateValue = (__m256 *)gateValue; __m256 *update_gate_value = (__m256 *)gate_value;
__m256 *updateGateGrad = (__m256 *)gateGrad; __m256 *update_gate_grad = (__m256 *)gate_grad;
__m256 *resetGateValue = (__m256 *)(gateValue + frameSize); __m256 *reset_gate_value = (__m256 *)(gate_value + frame_size);
__m256 *resetGateGrad = (__m256 *)(gateGrad + frameSize); __m256 *reset_gate_grad = (__m256 *)(gate_grad + frame_size);
for (int i = 0; i < frameSize / 8; i++) { for (int i = 0; i < frame_size / 8; i++) {
rUpdateGateValue = updateGateValue[i]; r_update_gate_value = update_gate_value[i];
rUpdateGateGrad = updateGateGrad[i]; r_update_gate_grad = update_gate_grad[i];
rResetGateValue = resetGateValue[i]; r_reset_gate_value = reset_gate_value[i];
if (prevOutValue && prevOutGrad) { if (prev_out_value && prev_out_grad) {
rResetOutputGrad = ((__m256 *)resetOutputGrad)[i]; r_reset_output_grad = ((__m256 *)reset_output_grad)[i];
} }
if (prevOutValue) { if (prev_out_value) {
rPrevOutValue = ((__m256 *)prevOutValue)[i]; r_prev_out_value = ((__m256 *)prev_out_value)[i];
} }
if (prevOutGrad) { if (prev_out_grad) {
rPrevOutGrad = ((__m256 *)prevOutGrad)[i]; r_prev_out_grad = ((__m256 *)prev_out_grad)[i];
} }
opResetGrad(rUpdateGateValue, rUpdateGateGrad, rResetGateValue, op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value,
rResetGateGrad, rPrevOutValue, rPrevOutGrad, rResetOutputGrad, r_reset_gate_grad, r_prev_out_value, r_prev_out_grad,
active_gate); r_reset_output_grad, active_gate);
updateGateGrad[i] = rUpdateGateGrad; update_gate_grad[i] = r_update_gate_grad;
resetGateGrad[i] = rResetGateGrad; reset_gate_grad[i] = r_reset_gate_grad;
if (prevOutGrad) { if (prev_out_grad) {
((__m256 *)prevOutGrad)[i] = rPrevOutGrad; ((__m256 *)prev_out_grad)[i] = r_prev_out_grad;
} }
} }
#endif #endif
} }
template <class OpStateGrad, typename T> template <class OpStateGrad, typename T>
inline void backward_state_grad(OpStateGrad opStateGrad, hl_gru_value<T> value, inline void backward_state_grad(OpStateGrad op_state_grad,
hl_gru_grad<T> grad, int frameSize, hl_gru_value<T> value, hl_gru_grad<T> grad,
int batchSize, activation_mode_t active_node) { int frame_size, int batch_size,
for (int b = 0; b < batchSize; b++) { activation_mode_t active_node) {
if (OpStateGrad::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) { for (int b = 0; b < batch_size; b++) {
if (OpStateGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
hl_avx_gru_backward_state_grad( hl_avx_gru_backward_state_grad(
opStateGrad, value.gateValue, grad.gateGrad, value.prevOutValue, op_state_grad, value.gate_value, grad.gate_grad, value.prev_out_value,
grad.prevOutGrad, grad.outputGrad, frameSize, active_node); grad.prev_out_grad, grad.output_grad, frame_size, active_node);
} else { } else {
hl_naive_gru_backward_state_grad( hl_naive_gru_backward_state_grad(
opStateGrad, value.gateValue, grad.gateGrad, value.prevOutValue, op_state_grad, value.gate_value, grad.gate_grad, value.prev_out_value,
grad.prevOutGrad, grad.outputGrad, frameSize, active_node); grad.prev_out_grad, grad.output_grad, frame_size, active_node);
} }
value.gateValue += frameSize * 3; value.gate_value += frame_size * 3;
if (value.prevOutValue) { if (value.prev_out_value) {
value.prevOutValue += frameSize; value.prev_out_value += frame_size;
} }
grad.gateGrad += frameSize * 3; grad.gate_grad += frame_size * 3;
grad.outputGrad += frameSize; grad.output_grad += frame_size;
if (grad.prevOutGrad) { if (grad.prev_out_grad) {
grad.prevOutGrad += frameSize; grad.prev_out_grad += frame_size;
} }
} }
} }
template <class OpResetGrad, typename T> template <class OpResetGrad, typename T>
inline void backward_reset_grad(OpResetGrad opResetGrad, hl_gru_value<T> value, inline void backward_reset_grad(OpResetGrad op_reset_grad,
hl_gru_grad<T> grad, int frameSize, hl_gru_value<T> value, hl_gru_grad<T> grad,
int batchSize, activation_mode_t active_gate) { int frame_size, int batch_size,
for (int b = 0; b < batchSize; b++) { activation_mode_t active_gate) {
if (OpResetGrad::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) { for (int b = 0; b < batch_size; b++) {
if (OpResetGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
hl_avx_gru_backward_reset_grad( hl_avx_gru_backward_reset_grad(
opResetGrad, value.gateValue, grad.gateGrad, value.prevOutValue, op_reset_grad, value.gate_value, grad.gate_grad, value.prev_out_value,
grad.prevOutGrad, grad.resetOutputGrad, frameSize, active_gate); grad.prev_out_grad, grad.reset_output_grad, frame_size, active_gate);
} else { } else {
hl_naive_gru_backward_reset_grad( hl_naive_gru_backward_reset_grad(
opResetGrad, value.gateValue, grad.gateGrad, value.prevOutValue, op_reset_grad, value.gate_value, grad.gate_grad, value.prev_out_value,
grad.prevOutGrad, grad.resetOutputGrad, frameSize, active_gate); grad.prev_out_grad, grad.reset_output_grad, frame_size, active_gate);
} }
value.gateValue += frameSize * 3; value.gate_value += frame_size * 3;
if (value.prevOutValue) { if (value.prev_out_value) {
value.prevOutValue += frameSize; value.prev_out_value += frame_size;
} }
grad.gateGrad += frameSize * 3; grad.gate_grad += frame_size * 3;
grad.resetOutputGrad += frameSize; grad.reset_output_grad += frame_size;
if (grad.prevOutGrad) { if (grad.prev_out_grad) {
grad.prevOutGrad += frameSize; grad.prev_out_grad += frame_size;
} }
} }
} }
......
...@@ -27,174 +27,174 @@ namespace math { ...@@ -27,174 +27,174 @@ namespace math {
namespace detail { namespace detail {
/* /*
* threads(framePerBlock, batchPerBlock) * threads(frame_per_block, batch_per_block)
* grid(frameBlocks, batchBlocks) * grid(frame_blocks, batch_blocks)
*/ */
template <class OpResetOutput, bool isBatch, typename T> template <class OpResetOutput, bool is_batch, typename T>
__global__ void KeGruForwardResetOutput(OpResetOutput opResetOutput, __global__ void KeGruForwardResetOutput(OpResetOutput op_reset_output,
T *gateValue, T *resetOutputValue, T *gate_value, T *reset_output_value,
T *prevOutputValue, int frameSize, T *prev_output_value, int frame_size,
int batchSize, int batch_size,
activation_mode_t active_gate) { activation_mode_t active_gate) {
const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x; const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (frameIdx >= frameSize) return; if (frame_idx >= frame_size) return;
int batchIdx = 0; int batch_idx = 0;
if (isBatch) { if (is_batch) {
batchIdx = blockIdx.y * blockDim.y + threadIdx.y; batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
if (batchIdx >= batchSize) return; if (batch_idx >= batch_size) return;
gateValue += batchIdx * 3 * frameSize; gate_value += batch_idx * 3 * frame_size;
resetOutputValue += batchIdx * frameSize; reset_output_value += batch_idx * frame_size;
} }
T rPrevOut = 0; T r_prev_out = 0;
T rValueResetOutput; T r_value_reset_output;
T rValueUpdateGate = gateValue[frameIdx + frameSize * 0]; T r_value_update_gate = gate_value[frame_idx + frame_size * 0];
T rValueResetGate = gateValue[frameIdx + frameSize * 1]; T r_value_reset_gate = gate_value[frame_idx + frame_size * 1];
if (prevOutputValue) { if (prev_output_value) {
if (isBatch) prevOutputValue += batchIdx * frameSize; if (is_batch) prev_output_value += batch_idx * frame_size;
rPrevOut = prevOutputValue[frameIdx]; r_prev_out = prev_output_value[frame_idx];
} }
opResetOutput(rValueUpdateGate, rValueResetGate, rPrevOut, rValueResetOutput, op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out,
active_gate); r_value_reset_output, active_gate);
gateValue[frameIdx + frameSize * 0] = rValueUpdateGate; gate_value[frame_idx + frame_size * 0] = r_value_update_gate;
gateValue[frameIdx + frameSize * 1] = rValueResetGate; gate_value[frame_idx + frame_size * 1] = r_value_reset_gate;
resetOutputValue[frameIdx] = rValueResetOutput; reset_output_value[frame_idx] = r_value_reset_output;
} }
/* /*
* threads(framePerBlock, batchPerBlock) * threads(frame_per_block, batch_per_block)
* grid(frameBlocks, batchBlocks) * grid(frame_blocks, batch_blocks)
*/ */
template <class OpFinalOutput, bool isBatch, typename T> template <class OpFinalOutput, bool is_batch, typename T>
__global__ void KeGruForwardFinalOutput(OpFinalOutput opFinalOutput, __global__ void KeGruForwardFinalOutput(OpFinalOutput op_final_output,
T *gateValue, T *prevOutputValue, T *gate_value, T *prev_output_value,
T *outputValue, int frameSize, T *output_value, int frame_size,
int batchSize, int batch_size,
activation_mode_t active_node) { activation_mode_t active_node) {
const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x; const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (frameIdx >= frameSize) return; if (frame_idx >= frame_size) return;
int batchIdx = 0; int batch_idx = 0;
if (isBatch) { if (is_batch) {
batchIdx = blockIdx.y * blockDim.y + threadIdx.y; batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
if (batchIdx >= batchSize) return; if (batch_idx >= batch_size) return;
gateValue += batchIdx * 3 * frameSize; gate_value += batch_idx * 3 * frame_size;
outputValue += batchIdx * frameSize; output_value += batch_idx * frame_size;
} }
T rOutput; T r_output;
T rPrevOut = 0; T r_prev_out = 0;
T rValueUpdateGate = gateValue[frameIdx + frameSize * 0]; T r_value_update_gate = gate_value[frame_idx + frame_size * 0];
T rValueFrameState = gateValue[frameIdx + frameSize * 2]; T r_value_frame_state = gate_value[frame_idx + frame_size * 2];
if (prevOutputValue) { if (prev_output_value) {
if (isBatch) prevOutputValue += batchIdx * frameSize; if (is_batch) prev_output_value += batch_idx * frame_size;
rPrevOut = prevOutputValue[frameIdx]; r_prev_out = prev_output_value[frame_idx];
} }
opFinalOutput(rValueUpdateGate, rValueFrameState, rPrevOut, rOutput, op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out,
active_node); r_output, active_node);
gateValue[frameIdx + frameSize * 2] = rValueFrameState; gate_value[frame_idx + frame_size * 2] = r_value_frame_state;
outputValue[frameIdx] = rOutput; output_value[frame_idx] = r_output;
} }
/* /*
* threads(framePerBlock, batchPerBlock) * threads(frame_per_block, batch_per_block)
* grid(frameBlocks, batchBlocks) * grid(frame_blocks, batch_blocks)
*/ */
template <class OpStateGrad, bool isBatch, typename T> template <class OpStateGrad, bool is_batch, typename T>
__global__ void KeGruBackwardStateGrad(OpStateGrad opStateGrad, T *gateValue, __global__ void KeGruBackwardStateGrad(OpStateGrad op_state_grad, T *gate_value,
T *gateGrad, T *prevOutValue, T *gate_grad, T *prev_out_value,
T *prevOutGrad, T *outputGrad, T *prev_out_grad, T *output_grad,
int frameSize, int batchSize, int frame_size, int batch_size,
activation_mode_t active_node) { activation_mode_t active_node) {
const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x; const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (frameIdx >= frameSize) return; if (frame_idx >= frame_size) return;
int batchIdx = 0; int batch_idx = 0;
if (isBatch) { if (is_batch) {
batchIdx = blockIdx.y * blockDim.y + threadIdx.y; batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
if (batchIdx >= batchSize) return; if (batch_idx >= batch_size) return;
gateValue += batchIdx * 3 * frameSize; gate_value += batch_idx * 3 * frame_size;
gateGrad += batchIdx * 3 * frameSize; gate_grad += batch_idx * 3 * frame_size;
outputGrad += batchIdx * frameSize; output_grad += batch_idx * frame_size;
} }
T rUpdateGateGrad; T r_update_gate_grad;
T rFrameStateGrad; T r_frame_state_grad;
T rPrevOutValue = 0; T r_prev_out_value = 0;
T rPrevOutGrad = 0; T r_prev_out_grad = 0;
T rUpdateGateValue = gateValue[frameIdx + frameSize * 0]; T r_update_gate_value = gate_value[frame_idx + frame_size * 0];
T rFrameStateValue = gateValue[frameIdx + frameSize * 2]; T r_frame_state_value = gate_value[frame_idx + frame_size * 2];
T rOutGrad = outputGrad[frameIdx]; T r_out_grad = output_grad[frame_idx];
if (prevOutValue && prevOutGrad) { if (prev_out_value && prev_out_grad) {
if (isBatch) prevOutValue += batchIdx * frameSize; if (is_batch) prev_out_value += batch_idx * frame_size;
rPrevOutValue = prevOutValue[frameIdx]; r_prev_out_value = prev_out_value[frame_idx];
if (isBatch) prevOutGrad += batchIdx * frameSize; if (is_batch) prev_out_grad += batch_idx * frame_size;
rPrevOutGrad = prevOutGrad[frameIdx]; r_prev_out_grad = prev_out_grad[frame_idx];
} }
opStateGrad(rUpdateGateValue, rUpdateGateGrad, rFrameStateValue, op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value,
rFrameStateGrad, rPrevOutValue, rPrevOutGrad, rOutGrad, r_frame_state_grad, r_prev_out_value, r_prev_out_grad,
active_node); r_out_grad, active_node);
gateGrad[frameIdx + frameSize * 0] = rUpdateGateGrad; gate_grad[frame_idx + frame_size * 0] = r_update_gate_grad;
gateGrad[frameIdx + frameSize * 2] = rFrameStateGrad; gate_grad[frame_idx + frame_size * 2] = r_frame_state_grad;
if (prevOutGrad) { if (prev_out_grad) {
prevOutGrad[frameIdx] = rPrevOutGrad; prev_out_grad[frame_idx] = r_prev_out_grad;
} }
} }
/* /*
* threads(framePerBlock, batchPerBlock) * threads(frame_per_block, batch_per_block)
* grid(frameBlocks, batchBlocks) * grid(frame_blocks, batch_blocks)
*/ */
template <class OpResetGrad, bool isBatch, typename T> template <class OpResetGrad, bool is_batch, typename T>
__global__ void KeGruBackwardResetGrad(OpResetGrad opResetGrad, T *gateValue, __global__ void KeGruBackwardResetGrad(OpResetGrad op_reset_grad, T *gate_value,
T *gateGrad, T *prevOutValue, T *gate_grad, T *prev_out_value,
T *prevOutGrad, T *resetOutputGrad, T *prev_out_grad, T *reset_output_grad,
int frameSize, int batchSize, int frame_size, int batch_size,
activation_mode_t active_gate) { activation_mode_t active_gate) {
const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x; const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (frameIdx >= frameSize) return; if (frame_idx >= frame_size) return;
int batchIdx = 0; int batch_idx = 0;
if (isBatch) { if (is_batch) {
batchIdx = blockIdx.y * blockDim.y + threadIdx.y; batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
if (batchIdx >= batchSize) return; if (batch_idx >= batch_size) return;
gateValue += batchIdx * 3 * frameSize; gate_value += batch_idx * 3 * frame_size;
gateGrad += batchIdx * 3 * frameSize; gate_grad += batch_idx * 3 * frame_size;
resetOutputGrad += batchIdx * frameSize; reset_output_grad += batch_idx * frame_size;
} }
T rResetGateGrad; T r_reset_gate_grad;
T rPrevOutValue = 0; T r_prev_out_value = 0;
T rPrevOutGrad = 0; T r_prev_out_grad = 0;
T rResetOutputGrad = 0; T r_reset_output_grad = 0;
T rUpdateGateValue = gateValue[frameIdx + frameSize * 0]; T r_update_gate_value = gate_value[frame_idx + frame_size * 0];
T rUpdateGateGrad = gateGrad[frameIdx + frameSize * 0]; T r_update_gate_grad = gate_grad[frame_idx + frame_size * 0];
T rResetGateValue = gateValue[frameIdx + frameSize * 1]; T r_reset_gate_value = gate_value[frame_idx + frame_size * 1];
if (prevOutValue && prevOutGrad) { if (prev_out_value && prev_out_grad) {
if (isBatch) prevOutValue += batchIdx * frameSize; if (is_batch) prev_out_value += batch_idx * frame_size;
if (isBatch) prevOutGrad += batchIdx * frameSize; if (is_batch) prev_out_grad += batch_idx * frame_size;
rPrevOutValue = prevOutValue[frameIdx]; r_prev_out_value = prev_out_value[frame_idx];
rPrevOutGrad = prevOutGrad[frameIdx]; r_prev_out_grad = prev_out_grad[frame_idx];
rResetOutputGrad = resetOutputGrad[frameIdx]; r_reset_output_grad = reset_output_grad[frame_idx];
} }
opResetGrad(rUpdateGateValue, rUpdateGateGrad, rResetGateValue, op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value,
rResetGateGrad, rPrevOutValue, rPrevOutGrad, rResetOutputGrad, r_reset_gate_grad, r_prev_out_value, r_prev_out_grad,
active_gate); r_reset_output_grad, active_gate);
gateGrad[frameIdx + frameSize * 0] = rUpdateGateGrad; gate_grad[frame_idx + frame_size * 0] = r_update_gate_grad;
gateGrad[frameIdx + frameSize * 1] = rResetGateGrad; gate_grad[frame_idx + frame_size * 1] = r_reset_gate_grad;
if (prevOutGrad) { if (prev_out_grad) {
prevOutGrad[frameIdx] = rPrevOutGrad; prev_out_grad[frame_idx] = r_prev_out_grad;
} }
} }
} // namespace detail } // namespace detail
......
...@@ -28,23 +28,25 @@ namespace forward { ...@@ -28,23 +28,25 @@ namespace forward {
template <typename T> template <typename T>
class gru_resetOutput { class gru_resetOutput {
public: public:
// Scalar forward step for the reset output.
// Activates the update and reset gates in place with act_gate, then writes
// prev_out * (activated reset gate) into value_reset_output.
HOSTDEVICE void operator()(T &value_update_gate, T &value_reset_gate,
                           T &prev_out, T &value_reset_output,
                           activation_mode_t act_gate) {
  // Gates are overwritten with their activated values.
  value_update_gate = activation(value_update_gate, act_gate);
  value_reset_gate = activation(value_reset_gate, act_gate);

  // Uses the already-activated reset gate.
  value_reset_output = prev_out * value_reset_gate;
}
#ifndef __NVCC__ #ifndef __NVCC__
#ifndef __AVX__ #ifndef __AVX__
static const bool avx = false; static const bool avx = false;
#else #else
static const bool avx = true; static const bool avx = true;
// AVX forward step for the reset output: same computation as the scalar
// overload, applied to eight packed floats at a time.
HOSTDEVICE void operator()(__m256 &value_update_gate,
                           __m256 &value_reset_gate, __m256 &prev_out,
                           __m256 &value_reset_output,
                           activation_mode_t act_gate) {
  // Gates are overwritten with their activated values.
  value_update_gate = activation(value_update_gate, act_gate);
  value_reset_gate = activation(value_reset_gate, act_gate);

  // Lane-wise prev_out * activated reset gate.
  const __m256 gated_prev = _mm256_mul_ps(prev_out, value_reset_gate);
  value_reset_output = gated_prev;
}
#endif #endif
#endif #endif
...@@ -53,24 +55,26 @@ class gru_resetOutput { ...@@ -53,24 +55,26 @@ class gru_resetOutput {
template <typename T> template <typename T>
class gru_finalOutput { class gru_finalOutput {
public: public:
// Scalar forward step for the final GRU output.
// Activates the frame state in place with act_input, then writes
//   prev_out - update_gate * prev_out + update_gate * frame_state
// into value_output. value_update_gate is read but not modified.
HOSTDEVICE void operator()(T &value_update_gate, T &value_frame_state,
                           T &prev_out, T &value_output,
                           activation_mode_t act_input) {
  value_frame_state = activation(value_frame_state, act_input);

  // Expression shape is kept as-is so the floating-point evaluation order
  // does not change.
  value_output = prev_out - (value_update_gate * prev_out) +
                 (value_update_gate * value_frame_state);
}
#ifndef __NVCC__ #ifndef __NVCC__
#ifndef __AVX__ #ifndef __AVX__
static const bool avx = false; static const bool avx = false;
#else #else
static const bool avx = true; static const bool avx = true;
// AVX forward step for the final GRU output: same computation as the scalar
// overload, applied to eight packed floats at a time.
HOSTDEVICE void operator()(__m256 &value_update_gate,
                           __m256 &value_frame_state, __m256 &prev_out,
                           __m256 &value_output,
                           activation_mode_t act_input) {
  value_frame_state = activation(value_frame_state, act_input);

  // (prev_out - update_gate * prev_out) + update_gate * frame_state
  const __m256 gated_prev = _mm256_mul_ps(value_update_gate, prev_out);
  const __m256 gated_state =
      _mm256_mul_ps(value_update_gate, value_frame_state);
  const __m256 kept_prev = _mm256_sub_ps(prev_out, gated_prev);
  value_output = _mm256_add_ps(kept_prev, gated_state);
}
#endif #endif
#endif #endif
...@@ -82,34 +86,37 @@ namespace backward { ...@@ -82,34 +86,37 @@ namespace backward {
template <typename T> template <typename T>
class gru_stateGrad { class gru_stateGrad {
public: public:
// Scalar backward step for the frame state.
// Writes:
//   grad_update_gate  = grad_output * frame_state - grad_output * prev_out
//   grad_prev_out     = grad_prev_out - grad_output * update_gate + grad_output
//   grad_frame_state  = activation(grad_output * update_gate, frame_state,
//                                  act_input)
// NOTE(review): the three-argument activation() is presumably the backward
// (derivative) form — confirm against its definition.
HOSTDEVICE void operator()(T &value_update_gate, T &grad_update_gate,
                           T &value_frame_state, T &grad_frame_state,
                           T &value_prev_out, T &grad_prev_out,
                           T &grad_output, activation_mode_t act_input) {
  // Statement order and operation order match the original step by step.
  grad_update_gate = grad_output * value_frame_state;
  grad_update_gate = grad_update_gate - (grad_output * value_prev_out);

  grad_prev_out = grad_prev_out - (grad_output * value_update_gate);
  grad_prev_out = grad_prev_out + grad_output;

  grad_frame_state = activation(grad_output * value_update_gate,
                                value_frame_state, act_input);
}
#ifndef __NVCC__ #ifndef __NVCC__
#ifndef __AVX__ #ifndef __AVX__
static const bool avx = false; static const bool avx = false;
#else #else
static const bool avx = true; static const bool avx = true;
// AVX backward step for the frame state: same computation as the scalar
// overload, applied to eight packed floats at a time.
HOSTDEVICE void operator()(__m256 &value_update_gate,
                           __m256 &grad_update_gate,
                           __m256 &value_frame_state,
                           __m256 &grad_frame_state, __m256 &value_prev_out,
                           __m256 &grad_prev_out, __m256 &grad_output,
                           activation_mode_t act_input) {
  // grad_update_gate = grad_output * frame_state - grad_output * prev_out
  grad_update_gate = _mm256_mul_ps(grad_output, value_frame_state);
  const __m256 out_times_prev = _mm256_mul_ps(grad_output, value_prev_out);
  grad_update_gate = _mm256_sub_ps(grad_update_gate, out_times_prev);

  // grad_prev_out = (grad_prev_out - grad_output * update_gate) + grad_output
  const __m256 out_times_update =
      _mm256_mul_ps(grad_output, value_update_gate);
  grad_prev_out =
      _mm256_add_ps(_mm256_sub_ps(grad_prev_out, out_times_update),
                    grad_output);

  // The product is recomputed here, exactly as the original does.
  grad_frame_state = activation(_mm256_mul_ps(grad_output, value_update_gate),
                                value_frame_state, act_input);
}
#endif #endif
#endif #endif
...@@ -118,30 +125,32 @@ class gru_stateGrad { ...@@ -118,30 +125,32 @@ class gru_stateGrad {
template <typename T> template <typename T>
class gru_resetGrad { class gru_resetGrad {
public: public:
// Scalar backward step for the reset gate.
// Writes:
//   grad_reset_gate  = grad_reset_output * prev_out           (pre-activation)
//   grad_prev_out   += grad_reset_output * reset_gate
//   grad_update_gate = activation(grad_update_gate, update_gate, act_gate)
//   grad_reset_gate  = activation(grad_reset_gate, reset_gate, act_gate)
// NOTE(review): the three-argument activation() is presumably the backward
// (derivative) form — confirm against its definition.
HOSTDEVICE void operator()(T &value_update_gate, T &grad_update_gate,
                           T &value_reset_gate, T &grad_reset_gate,
                           T &value_prev_out, T &grad_prev_out,
                           T &grad_reset_output, activation_mode_t act_gate) {
  grad_reset_gate = grad_reset_output * value_prev_out;
  grad_prev_out = grad_prev_out + (grad_reset_output * value_reset_gate);

  // Both gate gradients are passed through the gate activation together with
  // the corresponding gate value.
  grad_update_gate =
      activation(grad_update_gate, value_update_gate, act_gate);
  grad_reset_gate = activation(grad_reset_gate, value_reset_gate, act_gate);
}
#ifndef __NVCC__ #ifndef __NVCC__
#ifndef __AVX__ #ifndef __AVX__
static const bool avx = false; static const bool avx = false;
#else #else
static const bool avx = true; static const bool avx = true;
// AVX backward step for the reset gate: same computation as the scalar
// overload, applied to eight packed floats at a time.
HOSTDEVICE void operator()(__m256 &value_update_gate,
                           __m256 &grad_update_gate, __m256 &value_reset_gate,
                           __m256 &grad_reset_gate, __m256 &value_prev_out,
                           __m256 &grad_prev_out, __m256 &grad_reset_output,
                           activation_mode_t act_gate) {
  grad_reset_gate = _mm256_mul_ps(grad_reset_output, value_prev_out);

  const __m256 out_times_reset =
      _mm256_mul_ps(grad_reset_output, value_reset_gate);
  grad_prev_out = _mm256_add_ps(grad_prev_out, out_times_reset);

  // Both gate gradients are passed through the gate activation together with
  // the corresponding gate value.
  grad_update_gate =
      activation(grad_update_gate, value_update_gate, act_gate);
  grad_reset_gate = activation(grad_reset_gate, value_reset_gate, act_gate);
}
#endif #endif
#endif #endif
......
...@@ -26,278 +26,284 @@ namespace detail { ...@@ -26,278 +26,284 @@ namespace detail {
template <class T, class Op> template <class T, class Op>
void naive_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value, void naive_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
int frameSize, int frame_size,
activation_mode_t active_node, activation_mode_t active_node,
activation_mode_t active_gate, activation_mode_t active_gate,
activation_mode_t active_state) { activation_mode_t active_state) {
T rValueIn; T r_value_in;
T rValueIg; T r_value_ig;
T rValueFg; T r_value_fg;
T rValueOg; T r_value_og;
T rCheckI; T r_checkI;
T rCheckF; T r_checkF;
T rCheckO; T r_checkO;
T rState; T r_state;
T rPrevState = 0; T r_prev_state = 0;
T rStateAtv; T r_state_atv;
T rOut; T r_out;
T *valueIn = value.gateValue; T *value_in = value.gate_value;
T *valueIg = value.gateValue + frameSize; T *value_ig = value.gate_value + frame_size;
T *valueFg = value.gateValue + frameSize * 2; T *value_fg = value.gate_value + frame_size * 2;
T *valueOg = value.gateValue + frameSize * 3; T *value_og = value.gate_value + frame_size * 3;
for (int i = 0; i < frameSize; i++) { for (int i = 0; i < frame_size; i++) {
rValueIn = valueIn[i]; r_value_in = value_in[i];
rValueIg = valueIg[i]; r_value_ig = value_ig[i];
rValueFg = valueFg[i]; r_value_fg = value_fg[i];
rValueOg = valueOg[i]; r_value_og = value_og[i];
rCheckI = value.checkIg ? value.checkIg[i] : 0; r_checkI = value.check_ig ? value.check_ig[i] : 0;
rCheckF = value.checkFg ? value.checkFg[i] : 0; r_checkF = value.check_fg ? value.check_fg[i] : 0;
rCheckO = value.checkOg ? value.checkOg[i] : 0; r_checkO = value.check_og ? value.check_og[i] : 0;
if (value.prevStateValue) { if (value.prev_state_value) {
rPrevState = value.prevStateValue[i]; r_prev_state = value.prev_state_value[i];
} }
op(rValueIn, rValueIg, rValueFg, rValueOg, rPrevState, rState, rStateAtv, op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_prev_state, r_state,
rOut, rCheckI, rCheckF, rCheckO, active_node, active_gate, active_state); r_state_atv, r_out, r_checkI, r_checkF, r_checkO, active_node,
active_gate, active_state);
valueIn[i] = rValueIn; value_in[i] = r_value_in;
valueIg[i] = rValueIg; value_ig[i] = r_value_ig;
valueFg[i] = rValueFg; value_fg[i] = r_value_fg;
valueOg[i] = rValueOg; value_og[i] = r_value_og;
value.stateValue[i] = rState; value.state_value[i] = r_state;
value.stateActiveValue[i] = rStateAtv; value.state_active_value[i] = r_state_atv;
value.outputValue[i] = rOut; value.output_value[i] = r_out;
} }
} }
template <class T, class Op> template <class T, class Op>
void naive_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value, void naive_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
LstmMetaGrad<T> grad, int frameSize, LstmMetaGrad<T> grad, int frame_size,
activation_mode_t active_node, activation_mode_t active_node,
activation_mode_t active_gate, activation_mode_t active_gate,
activation_mode_t active_state) { activation_mode_t active_state) {
T rValueIn; T r_value_in;
T rValueIg; T r_value_ig;
T rValueFg; T r_value_fg;
T rValueOg; T r_value_og;
T rGradIn; T r_grad_in;
T rGradIg; T r_grad_ig;
T rGradFg; T r_grad_fg;
T rGradOg; T r_grad_og;
T rPrevState = 0; T r_prev_state = 0;
T rPrevStateGrad; T r_prev_state_grad;
T rState; T r_state;
T rStateGrad; T r_state_grad;
T rStateAtv; T r_state_atv;
T rOutputGrad; T r_output_grad;
T rCheckI; T r_checkI;
T rCheckF; T r_checkF;
T rCheckO; T r_checkO;
T rCheckIGrad; T r_checkIGrad;
T rCheckFGrad; T r_checkFGrad;
T rCheckOGrad; T r_checkOGrad;
T *valueIn = value.gateValue; T *value_in = value.gate_value;
T *valueIg = value.gateValue + frameSize; T *value_ig = value.gate_value + frame_size;
T *valueFg = value.gateValue + frameSize * 2; T *value_fg = value.gate_value + frame_size * 2;
T *valueOg = value.gateValue + frameSize * 3; T *value_og = value.gate_value + frame_size * 3;
T *gradIn = grad.gateGrad; T *grad_in = grad.gate_grad;
T *gradIg = grad.gateGrad + frameSize; T *grad_ig = grad.gate_grad + frame_size;
T *gradFg = grad.gateGrad + frameSize * 2; T *grad_fg = grad.gate_grad + frame_size * 2;
T *gradOg = grad.gateGrad + frameSize * 3; T *grad_og = grad.gate_grad + frame_size * 3;
for (int i = 0; i < frameSize; i++) { for (int i = 0; i < frame_size; i++) {
rValueIn = valueIn[i]; r_value_in = value_in[i];
rValueIg = valueIg[i]; r_value_ig = value_ig[i];
rValueFg = valueFg[i]; r_value_fg = value_fg[i];
rValueOg = valueOg[i]; r_value_og = value_og[i];
rCheckI = value.checkIg ? value.checkIg[i] : 0; r_checkI = value.check_ig ? value.check_ig[i] : 0;
rCheckF = value.checkFg ? value.checkFg[i] : 0; r_checkF = value.check_fg ? value.check_fg[i] : 0;
rCheckO = value.checkOg ? value.checkOg[i] : 0; r_checkO = value.check_og ? value.check_og[i] : 0;
rState = value.stateValue[i]; r_state = value.state_value[i];
rStateAtv = value.stateActiveValue[i]; r_state_atv = value.state_active_value[i];
rOutputGrad = grad.outputGrad[i]; r_output_grad = grad.output_grad[i];
rStateGrad = grad.stateGrad[i]; r_state_grad = grad.state_grad[i];
if (value.prevStateValue) { if (value.prev_state_value) {
rPrevState = value.prevStateValue[i]; r_prev_state = value.prev_state_value[i];
} }
op(rValueIn, rValueIg, rValueFg, rValueOg, rGradIn, rGradIg, rGradFg, op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_grad_in, r_grad_ig,
rGradOg, rPrevState, rPrevStateGrad, rState, rStateGrad, rStateAtv, r_grad_fg, r_grad_og, r_prev_state, r_prev_state_grad, r_state,
rOutputGrad, rCheckI, rCheckF, rCheckO, rCheckIGrad, rCheckFGrad, r_state_grad, r_state_atv, r_output_grad, r_checkI, r_checkF, r_checkO,
rCheckOGrad, active_node, active_gate, active_state); r_checkIGrad, r_checkFGrad, r_checkOGrad, active_node, active_gate,
active_state);
gradIn[i] = rGradIn;
gradIg[i] = rGradIg; grad_in[i] = r_grad_in;
gradFg[i] = rGradFg; grad_ig[i] = r_grad_ig;
gradOg[i] = rGradOg; grad_fg[i] = r_grad_fg;
grad.stateGrad[i] = rStateGrad; grad_og[i] = r_grad_og;
grad.state_grad[i] = r_state_grad;
if (grad.prevStateGrad) grad.prevStateGrad[i] = rPrevStateGrad;
if (value.prevStateValue) { if (grad.prev_state_grad) grad.prev_state_grad[i] = r_prev_state_grad;
if (grad.checkIgGrad) grad.checkIgGrad[i] += rCheckIGrad; if (value.prev_state_value) {
if (grad.checkFgGrad) grad.checkFgGrad[i] += rCheckFGrad; if (grad.check_ig_grad) grad.check_ig_grad[i] += r_checkIGrad;
if (grad.check_fg_grad) grad.check_fg_grad[i] += r_checkFGrad;
} }
if (grad.checkOgGrad) grad.checkOgGrad[i] += rCheckOGrad; if (grad.check_og_grad) grad.check_og_grad[i] += r_checkOGrad;
} }
} }
/**
 * AVX LSTM forward step for one frame, eight floats per iteration.
 *
 * Same data flow as naive_lstm_forward_one_sequence, but every buffer is
 * reinterpreted as an array of __m256 and frame_size / 8 vectors are
 * processed. The whole body is compiled only when __AVX__ is defined;
 * otherwise this function does nothing.
 *
 * The optional check_ig / check_fg / check_og buffers are each null-tested on
 * their own (a missing buffer contributes zeros), matching the naive path.
 * Previously only check_ig was tested before all three were dereferenced.
 *
 * NOTE(review): buffers are accessed through __m256 pointers, which
 * presumably requires 32-byte alignment and T == float — confirm that the
 * caller and allocator guarantee both.
 *
 * @param op            LSTM forward functor with an __m256 overload.
 * @param value         forward buffers; taken by value.
 * @param frame_size    number of elements per gate slice.
 * @param active_node   activation mode forwarded to op.
 * @param active_gate   activation mode forwarded to op.
 * @param active_state  activation mode forwarded to op.
 */
template <class T, class Op>
void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
                                   int frame_size,
                                   activation_mode_t active_node,
                                   activation_mode_t active_gate,
                                   activation_mode_t active_state) {
#ifdef __AVX__
  __m256 r_value_in;
  __m256 r_value_ig;
  __m256 r_value_fg;
  __m256 r_value_og;
  __m256 r_check_i = _mm256_set1_ps(0.0f);
  __m256 r_check_f = _mm256_set1_ps(0.0f);
  __m256 r_check_o = _mm256_set1_ps(0.0f);
  __m256 r_state;
  __m256 r_prev_state = _mm256_set1_ps(0.0f);
  __m256 r_state_atv;
  __m256 r_out;

  // Vector views of the four packed gate slices.
  __m256 *value_in = reinterpret_cast<__m256 *>(value.gate_value);
  __m256 *value_ig = reinterpret_cast<__m256 *>(value.gate_value + frame_size);
  __m256 *value_fg =
      reinterpret_cast<__m256 *>(value.gate_value + frame_size * 2);
  __m256 *value_og =
      reinterpret_cast<__m256 *>(value.gate_value + frame_size * 3);

  // Vector views of the remaining buffers (optional ones may be null).
  __m256 *check_ig = reinterpret_cast<__m256 *>(value.check_ig);
  __m256 *check_fg = reinterpret_cast<__m256 *>(value.check_fg);
  __m256 *check_og = reinterpret_cast<__m256 *>(value.check_og);
  __m256 *prev_state = reinterpret_cast<__m256 *>(value.prev_state_value);
  __m256 *state = reinterpret_cast<__m256 *>(value.state_value);
  __m256 *state_atv = reinterpret_cast<__m256 *>(value.state_active_value);
  __m256 *output = reinterpret_cast<__m256 *>(value.output_value);

  for (int i = 0; i < frame_size / 8; i++) {
    r_value_in = value_in[i];
    r_value_ig = value_ig[i];
    r_value_fg = value_fg[i];
    r_value_og = value_og[i];

    // Test every optional buffer individually instead of assuming that a
    // non-null check_ig implies the other two are present.
    if (check_ig) r_check_i = check_ig[i];
    if (check_fg) r_check_f = check_fg[i];
    if (check_og) r_check_o = check_og[i];

    if (prev_state) {
      r_prev_state = prev_state[i];
    }

    op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_prev_state, r_state,
       r_state_atv, r_out, r_check_i, r_check_f, r_check_o, active_node,
       active_gate, active_state);

    value_in[i] = r_value_in;
    value_ig[i] = r_value_ig;
    value_fg[i] = r_value_fg;
    value_og[i] = r_value_og;
    state[i] = r_state;
    state_atv[i] = r_state_atv;
    output[i] = r_out;
  }
#endif
}
/**
 * AVX LSTM backward step for one frame, eight floats per iteration.
 *
 * Same data flow as naive_lstm_backward_one_sequence, but every buffer is
 * reinterpreted as an array of __m256 and frame_size / 8 vectors are
 * processed. The whole body is compiled only when __AVX__ is defined;
 * otherwise this function does nothing.
 *
 * Changes relative to the previous revision:
 *  - check_ig / check_fg / check_og are each null-tested on their own (a
 *    missing buffer contributes zeros), matching the naive path. Previously
 *    only check_ig was tested before all three were dereferenced.
 *  - Gradient accumulation uses _mm256_add_ps instead of `+=` on an __m256
 *    lvalue, which relies on a compiler vector extension.
 *
 * NOTE(review): buffers are accessed through __m256 pointers, which
 * presumably requires 32-byte alignment and T == float — confirm that the
 * caller and allocator guarantee both.
 *
 * @param op            LSTM backward functor with an __m256 overload.
 * @param value         forward buffers; taken by value.
 * @param grad          gradient buffers; taken by value.
 * @param frame_size    number of elements per gate slice.
 * @param active_node   activation mode forwarded to op.
 * @param active_gate   activation mode forwarded to op.
 * @param active_state  activation mode forwarded to op.
 */
template <class T, class Op>
void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
                                    LstmMetaGrad<T> grad, int frame_size,
                                    activation_mode_t active_node,
                                    activation_mode_t active_gate,
                                    activation_mode_t active_state) {
#ifdef __AVX__
  __m256 r_value_in;
  __m256 r_value_ig;
  __m256 r_value_fg;
  __m256 r_value_og;
  __m256 r_grad_in;
  __m256 r_grad_ig;
  __m256 r_grad_fg;
  __m256 r_grad_og;
  __m256 r_prev_state = _mm256_set1_ps(0.0f);
  __m256 r_prev_state_grad;
  __m256 r_state_grad;
  __m256 r_state;
  __m256 r_state_atv;
  __m256 r_output_grad;
  __m256 r_check_i = _mm256_set1_ps(0.0f);
  __m256 r_check_f = _mm256_set1_ps(0.0f);
  __m256 r_check_o = _mm256_set1_ps(0.0f);
  __m256 r_check_i_grad;
  __m256 r_check_f_grad;
  __m256 r_check_o_grad;

  // Vector views of the packed gate value / gradient slices.
  __m256 *value_in = reinterpret_cast<__m256 *>(value.gate_value);
  __m256 *value_ig = reinterpret_cast<__m256 *>(value.gate_value + frame_size);
  __m256 *value_fg =
      reinterpret_cast<__m256 *>(value.gate_value + frame_size * 2);
  __m256 *value_og =
      reinterpret_cast<__m256 *>(value.gate_value + frame_size * 3);
  __m256 *grad_in = reinterpret_cast<__m256 *>(grad.gate_grad);
  __m256 *grad_ig = reinterpret_cast<__m256 *>(grad.gate_grad + frame_size);
  __m256 *grad_fg =
      reinterpret_cast<__m256 *>(grad.gate_grad + frame_size * 2);
  __m256 *grad_og =
      reinterpret_cast<__m256 *>(grad.gate_grad + frame_size * 3);

  // Vector views of the remaining buffers (optional ones may be null).
  __m256 *check_ig = reinterpret_cast<__m256 *>(value.check_ig);
  __m256 *check_fg = reinterpret_cast<__m256 *>(value.check_fg);
  __m256 *check_og = reinterpret_cast<__m256 *>(value.check_og);
  __m256 *prev_state = reinterpret_cast<__m256 *>(value.prev_state_value);
  __m256 *state = reinterpret_cast<__m256 *>(value.state_value);
  __m256 *state_atv = reinterpret_cast<__m256 *>(value.state_active_value);
  __m256 *output_grad = reinterpret_cast<__m256 *>(grad.output_grad);
  __m256 *state_grad = reinterpret_cast<__m256 *>(grad.state_grad);
  __m256 *prev_state_grad = reinterpret_cast<__m256 *>(grad.prev_state_grad);
  __m256 *check_ig_grad = reinterpret_cast<__m256 *>(grad.check_ig_grad);
  __m256 *check_fg_grad = reinterpret_cast<__m256 *>(grad.check_fg_grad);
  __m256 *check_og_grad = reinterpret_cast<__m256 *>(grad.check_og_grad);

  for (int i = 0; i < frame_size / 8; i++) {
    r_value_in = value_in[i];
    r_value_ig = value_ig[i];
    r_value_fg = value_fg[i];
    r_value_og = value_og[i];

    // Test every optional buffer individually instead of assuming that a
    // non-null check_ig implies the other two are present.
    if (check_ig) r_check_i = check_ig[i];
    if (check_fg) r_check_f = check_fg[i];
    if (check_og) r_check_o = check_og[i];

    r_state = state[i];
    r_state_atv = state_atv[i];
    r_output_grad = output_grad[i];
    r_state_grad = state_grad[i];
    if (prev_state) {
      r_prev_state = prev_state[i];
    }

    op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_grad_in, r_grad_ig,
       r_grad_fg, r_grad_og, r_prev_state, r_prev_state_grad, r_state,
       r_state_grad, r_state_atv, r_output_grad, r_check_i, r_check_f,
       r_check_o, r_check_i_grad, r_check_f_grad, r_check_o_grad, active_node,
       active_gate, active_state);

    grad_in[i] = r_grad_in;
    grad_ig[i] = r_grad_ig;
    grad_fg[i] = r_grad_fg;
    grad_og[i] = r_grad_og;
    state_grad[i] = r_state_grad;

    if (prev_state_grad) {
      prev_state_grad[i] = r_prev_state_grad;
    }
    // check_ig/check_fg gradients are accumulated only when a previous state
    // exists; the check_og gradient is accumulated unconditionally.
    if (prev_state) {
      if (check_ig_grad) {
        check_ig_grad[i] = _mm256_add_ps(check_ig_grad[i], r_check_i_grad);
      }
      if (check_fg_grad) {
        check_fg_grad[i] = _mm256_add_ps(check_fg_grad[i], r_check_f_grad);
      }
    }
    if (check_og_grad) {
      check_og_grad[i] = _mm256_add_ps(check_og_grad[i], r_check_o_grad);
    }
  }
#endif
}
template <class T, class Op> template <class T, class Op>
void cpu_lstm_forward(Op op, LstmMetaValue<T> value, int frameSize, void cpu_lstm_forward(Op op, LstmMetaValue<T> value, int frame_size,
activation_mode_t active_node, activation_mode_t active_node,
activation_mode_t active_gate, activation_mode_t active_gate,
activation_mode_t active_state) { activation_mode_t active_state) {
if (Op::avx && !(frameSize & (8 - 1)) && (std::is_same<T, float>::value)) { if (Op::avx && !(frame_size & (8 - 1)) && (std::is_same<T, float>::value)) {
avx_lstm_forward_one_sequence<T>(op, value, frameSize, active_node, avx_lstm_forward_one_sequence<T>(op, value, frame_size, active_node,
active_gate, active_state); active_gate, active_state);
} else { } else {
naive_lstm_forward_one_sequence<T>(op, value, frameSize, active_node, naive_lstm_forward_one_sequence<T>(op, value, frame_size, active_node,
active_gate, active_state); active_gate, active_state);
} }
} }
template <class T, class Op> template <class T, class Op>
void cpu_lstm_backward(Op op, LstmMetaValue<T> value, LstmMetaGrad<T> grad, void cpu_lstm_backward(Op op, LstmMetaValue<T> value, LstmMetaGrad<T> grad,
int frameSize, activation_mode_t active_node, int frame_size, activation_mode_t active_node,
activation_mode_t active_gate, activation_mode_t active_gate,
activation_mode_t active_state) { activation_mode_t active_state) {
if (Op::avx && !(frameSize & (8 - 1)) && (std::is_same<T, float>::value)) { if (Op::avx && !(frame_size & (8 - 1)) && (std::is_same<T, float>::value)) {
avx_lstm_backward_one_sequence<T>(op, value, grad, frameSize, active_node, avx_lstm_backward_one_sequence<T>(op, value, grad, frame_size, active_node,
active_gate, active_state); active_gate, active_state);
} else { } else {
naive_lstm_backward_one_sequence<T>(op, value, grad, frameSize, active_node, naive_lstm_backward_one_sequence<T>(op, value, grad, frame_size,
active_gate, active_state); active_node, active_gate, active_state);
} }
} }
......
...@@ -26,189 +26,192 @@ namespace math { ...@@ -26,189 +26,192 @@ namespace math {
namespace detail { namespace detail {
/*
 * threads(frame_per_block, batch_per_block)
 * grid(frame_blocks, batch_blocks)
 *
 * LSTM forward kernel. Each thread handles one frame index (x dimension)
 * and, when is_batch is true, one batch index (y dimension).
 *
 * value is received by value, so the pointer offsets applied below are local
 * to the thread. gate_value is indexed as four consecutive segments of
 * frame_size elements: in, ig, fg, og. The activated gate values are written
 * back in place, followed by the state, the activated state and the output.
 */
template <class T, class Op, bool is_batch>
__global__ void KeLstmForward(Op op, LstmMetaValue<T> value, int frame_size,
                              int batch_size, activation_mode_t active_node,
                              activation_mode_t active_gate,
                              activation_mode_t active_state) {
  const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (frame_idx >= frame_size) return;

  int batch_idx = 0;
  if (is_batch) {
    batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
    if (batch_idx >= batch_size) return;
    // Advance every per-sample buffer to this thread's row.
    value.gate_value += batch_idx * frame_size * 4;
    value.output_value += batch_idx * frame_size;
    value.state_value += batch_idx * frame_size;
    value.state_active_value += batch_idx * frame_size;
  }

  T r_state;
  T r_prev_state = 0;
  T r_state_atv;
  T r_out;

  T r_value_in;
  T r_value_ig;
  T r_value_fg;
  T r_value_og;

  // The check weights are indexed by frame only (no batch offset); a null
  // pointer is treated as a weight of 0.
  T r_checkI = value.check_ig ? value.check_ig[frame_idx] : 0;
  T r_checkF = value.check_fg ? value.check_fg[frame_idx] : 0;
  T r_checkO = value.check_og ? value.check_og[frame_idx] : 0;

  r_value_in = value.gate_value[frame_idx];
  r_value_ig = value.gate_value[frame_idx + frame_size];
  r_value_fg = value.gate_value[frame_idx + frame_size * 2];
  r_value_og = value.gate_value[frame_idx + frame_size * 3];

  // Without a previous state buffer r_prev_state stays 0.
  if (value.prev_state_value) {
    if (is_batch) value.prev_state_value += batch_idx * frame_size;
    r_prev_state = value.prev_state_value[frame_idx];
  }

  op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_prev_state, r_state,
     r_state_atv, r_out, r_checkI, r_checkF, r_checkO, active_node, active_gate,
     active_state);

  value.gate_value[frame_idx] = r_value_in;
  value.gate_value[frame_idx + frame_size] = r_value_ig;
  value.gate_value[frame_idx + frame_size * 2] = r_value_fg;
  value.gate_value[frame_idx + frame_size * 3] = r_value_og;

  value.state_value[frame_idx] = r_state;
  value.state_active_value[frame_idx] = r_state_atv;
  value.output_value[frame_idx] = r_out;
}
/*
 * threads(frame_per_block, batch_per_block)
 * grid(frame_blocks, batch_blocks)
 *
 * LSTM backward kernel. Each thread handles one frame index (x dimension)
 * and, when is_batch is true, one batch index (y dimension).
 *
 * value and grad are received by value, so the pointer offsets applied below
 * are local to the thread. gate_value and gate_grad are indexed as four
 * consecutive segments of frame_size elements: in, ig, fg, og.
 *
 * The check_*_grad buffers are indexed by frame only. In batch mode several
 * threads (one per batch row) target the same slot, so the accumulation goes
 * through CudaAtomicAdd; in the non-batch case a plain += is used.
 */
template <class T, class Op, bool is_batch>
__global__ void KeLstmBackward(Op op, LstmMetaValue<T> value,
                               LstmMetaGrad<T> grad, int frame_size,
                               int batch_size, activation_mode_t active_node,
                               activation_mode_t active_gate,
                               activation_mode_t active_state) {
  const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (frame_idx >= frame_size) return;

  int batch_idx = 0;
  if (is_batch) {
    batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
    if (batch_idx >= batch_size) return;
    // Advance every per-sample buffer to this thread's row.
    value.gate_value += batch_idx * frame_size * 4;
    value.state_value += batch_idx * frame_size;
    value.state_active_value += batch_idx * frame_size;
    grad.gate_grad += batch_idx * frame_size * 4;
    grad.state_grad += batch_idx * frame_size;
    grad.output_grad += batch_idx * frame_size;
  }

  T r_value_in;
  T r_value_ig;
  T r_value_fg;
  T r_value_og;
  T r_grad_in;
  T r_grad_ig;
  T r_grad_fg;
  T r_grad_og;
  T r_prev_state = 0;
  T r_prev_state_grad;
  T r_state;
  T r_state_grad;
  T r_state_atv;
  T r_output_grad;

  // A null check-weight pointer is treated as a weight of 0.
  T r_checkI = value.check_ig ? value.check_ig[frame_idx] : 0;
  T r_checkF = value.check_fg ? value.check_fg[frame_idx] : 0;
  T r_checkO = value.check_og ? value.check_og[frame_idx] : 0;

  T r_checkIGrad;
  T r_checkFGrad;
  T r_checkOGrad;

  r_value_in = value.gate_value[frame_idx];
  r_value_ig = value.gate_value[frame_idx + frame_size];
  r_value_fg = value.gate_value[frame_idx + frame_size * 2];
  r_value_og = value.gate_value[frame_idx + frame_size * 3];
  r_state = value.state_value[frame_idx];
  r_state_atv = value.state_active_value[frame_idx];
  r_output_grad = grad.output_grad[frame_idx];
  r_state_grad = grad.state_grad[frame_idx];

  // Without a previous state buffer r_prev_state stays 0.
  if (value.prev_state_value) {
    if (is_batch) value.prev_state_value += batch_idx * frame_size;
    r_prev_state = value.prev_state_value[frame_idx];
  }

  op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_grad_in, r_grad_ig,
     r_grad_fg, r_grad_og, r_prev_state, r_prev_state_grad, r_state,
     r_state_grad, r_state_atv, r_output_grad, r_checkI, r_checkF, r_checkO,
     r_checkIGrad, r_checkFGrad, r_checkOGrad, active_node, active_gate,
     active_state);

  grad.gate_grad[frame_idx] = r_grad_in;
  grad.gate_grad[frame_idx + frame_size] = r_grad_ig;
  grad.gate_grad[frame_idx + frame_size * 2] = r_grad_fg;
  grad.gate_grad[frame_idx + frame_size * 3] = r_grad_og;
  grad.state_grad[frame_idx] = r_state_grad;

  if (grad.prev_state_grad) {
    if (is_batch) grad.prev_state_grad += batch_idx * frame_size;
    grad.prev_state_grad[frame_idx] = r_prev_state_grad;
  }

  if (is_batch) {
    // The ig/fg check gradients are only accumulated when a previous state
    // exists; the og check gradient is accumulated unconditionally.
    if (value.prev_state_value) {
      if (grad.check_ig_grad)
        paddle::platform::CudaAtomicAdd(grad.check_ig_grad + frame_idx,
                                        r_checkIGrad);
      if (grad.check_fg_grad)
        paddle::platform::CudaAtomicAdd(grad.check_fg_grad + frame_idx,
                                        r_checkFGrad);
    }
    if (grad.check_og_grad)
      paddle::platform::CudaAtomicAdd(grad.check_og_grad + frame_idx,
                                      r_checkOGrad);
  } else {
    if (value.prev_state_value) {
      if (grad.check_ig_grad) grad.check_ig_grad[frame_idx] += r_checkIGrad;
      if (grad.check_fg_grad) grad.check_fg_grad[frame_idx] += r_checkFGrad;
    }
    if (grad.check_og_grad) grad.check_og_grad[frame_idx] += r_checkOGrad;
  }
}
/*
 * Launches KeLstmForward on the CUDA stream of the given device context.
 *
 * batch_size == 1: a 1-D launch with up to 1024 threads per block along the
 *                  frame axis, and the kernel is instantiated with
 *                  is_batch = false.
 * otherwise:       32 x 32 thread blocks tiling (frame, batch), and the
 *                  kernel is instantiated with is_batch = true.
 *
 * context is reinterpreted as a platform::CUDADeviceContext to obtain the
 * stream, so callers must pass a CUDA device context.
 */
template <class T, class Op>
void gpu_lstm_forward(const platform::DeviceContext& context, Op op,
                      LstmMetaValue<T> value, int frame_size, int batch_size,
                      activation_mode_t active_node,
                      activation_mode_t active_gate,
                      activation_mode_t active_state) {
  dim3 threads;
  dim3 grid;
  if (batch_size == 1) {
    int frame_per_block = frame_size <= 1024 ? frame_size : 1024;
    int frame_blocks = (frame_size + 1024 - 1) / 1024;  // ceil(frame_size/1024)
    threads = dim3(frame_per_block, 1);
    grid = dim3(frame_blocks, 1);
  } else {
    /* frame_per_block = 32 batch_per_block = 32 */
    threads = dim3(32, 32);
    grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32);
  }

  auto stream =
      reinterpret_cast<const platform::CUDADeviceContext&>(context).stream();
  if (batch_size == 1) {
    KeLstmForward<T, Op,
                  /* is_batch= */ false><<<grid, threads, 0, stream>>>(
        op, value, frame_size, batch_size, active_node, active_gate,
        active_state);
  } else {
    KeLstmForward<T, Op,
                  /* is_batch= */ true><<<grid, threads, 0, stream>>>(
        op, value, frame_size, batch_size, active_node, active_gate,
        active_state);
  }
}
...@@ -216,34 +219,34 @@ void gpu_lstm_forward(const platform::DeviceContext& context, Op op, ...@@ -216,34 +219,34 @@ void gpu_lstm_forward(const platform::DeviceContext& context, Op op,
/*
 * Launches KeLstmBackward on the CUDA stream of the given device context.
 *
 * batch_size == 1: a 1-D launch with up to 1024 threads per block along the
 *                  frame axis, and the kernel is instantiated with
 *                  is_batch = false.
 * otherwise:       32 x 16 thread blocks tiling (frame, batch), and the
 *                  kernel is instantiated with is_batch = true.
 *
 * context is reinterpreted as a platform::CUDADeviceContext to obtain the
 * stream, so callers must pass a CUDA device context.
 */
template <class T, class Op>
void gpu_lstm_backward(const platform::DeviceContext& context, Op op,
                       LstmMetaValue<T> value, LstmMetaGrad<T> grad,
                       int frame_size, int batch_size,
                       activation_mode_t active_node,
                       activation_mode_t active_gate,
                       activation_mode_t active_state) {
  dim3 threads;
  dim3 grid;
  if (batch_size == 1) {
    int frame_per_block = frame_size <= 1024 ? frame_size : 1024;
    int frame_blocks = (frame_size + 1024 - 1) / 1024;  // ceil(frame_size/1024)
    threads = dim3(frame_per_block, 1);
    grid = dim3(frame_blocks, 1);
  } else {
    /* frame_per_block = 32 batch_per_block = 16 */
    threads = dim3(32, 16);
    grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 16 - 1) / 16);
  }

  auto stream =
      reinterpret_cast<const platform::CUDADeviceContext&>(context).stream();
  if (batch_size == 1) {
    KeLstmBackward<T, Op,
                   /* is_batch= */ false><<<grid, threads, 0, stream>>>(
        op, value, grad, frame_size, batch_size, active_node, active_gate,
        active_state);
  } else {
    KeLstmBackward<T, Op,
                   /* is_batch= */ true><<<grid, threads, 0, stream>>>(
        op, value, grad, frame_size, batch_size, active_node, active_gate,
        active_state);
  }
}
......
...@@ -27,19 +27,19 @@ namespace forward { ...@@ -27,19 +27,19 @@ namespace forward {
template <class T> template <class T>
class lstm { class lstm {
public: public:
// Scalar LSTM forward step for a single element. All value arguments are
// updated in place:
//   value_in  = act_node(value_in)
//   value_ig  = act_gate(value_ig + prev_state * checkI)
//   value_fg  = act_gate(value_fg + prev_state * checkF)
//   state     = value_in * value_ig + prev_state * value_fg
//   value_og  = act_gate(value_og + state * checkO)
//   state_atv = act_state(state)
//   output    = value_og * state_atv
// Note that value_og depends on the freshly computed state, so the statement
// order below matters.
HOSTDEVICE void operator()(T &value_in, T &value_ig, T &value_fg, T &value_og,
                           T &prev_state, T &state, T &state_atv, T &output,
                           T &checkI, T &checkF, T &checkO,
                           activation_mode_t active_node,
                           activation_mode_t active_gate,
                           activation_mode_t active_state) {
  value_in = activation(value_in, active_node);
  value_ig = activation(value_ig + prev_state * checkI, active_gate);
  value_fg = activation(value_fg + prev_state * checkF, active_gate);
  state = value_in * value_ig + prev_state * value_fg;
  value_og = activation(value_og + state * checkO, active_gate);
  state_atv = activation(state, active_state);
  output = value_og * state_atv;
}
#ifndef __NVCC__ #ifndef __NVCC__
#ifndef __AVX__ // If not compiled with AVX instructs. Disable AVX by default #ifndef __AVX__ // If not compiled with AVX instructs. Disable AVX by default
...@@ -48,24 +48,27 @@ class lstm { ...@@ -48,24 +48,27 @@ class lstm {
// Only float support AVX optimization // Only float support AVX optimization
static const bool avx = std::is_same<T, float>::value; static const bool avx = std::is_same<T, float>::value;
// AVX LSTM forward step. Applies the same formulas as the scalar overload,
// element-wise on __m256 vectors, using _mm256_add_ps / _mm256_mul_ps for the
// arithmetic. All value arguments are updated in place.
HOSTDEVICE void operator()(__m256 &value_in, __m256 &value_ig,
                           __m256 &value_fg, __m256 &value_og,
                           __m256 &prev_state, __m256 &state,
                           __m256 &state_atv, __m256 &output, __m256 &checkI,
                           __m256 &checkF, __m256 &checkO,
                           activation_mode_t active_node,
                           activation_mode_t active_gate,
                           activation_mode_t active_state) {
  value_in = activation(value_in, active_node);
  value_ig =
      activation(_mm256_add_ps(value_ig, _mm256_mul_ps(prev_state, checkI)),
                 active_gate);
  value_fg =
      activation(_mm256_add_ps(value_fg, _mm256_mul_ps(prev_state, checkF)),
                 active_gate);
  state = _mm256_add_ps(_mm256_mul_ps(value_in, value_ig),
                        _mm256_mul_ps(prev_state, value_fg));
  // value_og uses the state computed just above.
  value_og = activation(_mm256_add_ps(value_og, _mm256_mul_ps(state, checkO)),
                        active_gate);
  state_atv = activation(state, active_state);
  output = _mm256_mul_ps(value_og, state_atv);
}
#endif #endif
#endif #endif
...@@ -78,25 +81,26 @@ namespace backward { ...@@ -78,25 +81,26 @@ namespace backward {
template <class T> template <class T>
class lstm { class lstm {
public: public:
// Scalar LSTM backward step for a single element.
//
// Outputs written: grad_in, grad_ig, grad_fg, grad_og, prev_state_grad,
// checkIGrad, checkFGrad, checkOGrad. state_grad is read AND accumulated
// into (+=), and its updated value is used by every later statement, so the
// statement order below matters.
//
// The three-argument activation(...) overload used here is defined elsewhere
// in the project; it is called as activation(upstream_grad, activated_value,
// mode).
HOSTDEVICE void operator()(T &value_in, T &value_ig, T &value_fg, T &value_og,
                           T &grad_in, T &grad_ig, T &grad_fg, T &grad_og,
                           T &prev_state, T &prev_state_grad, T &state,
                           T &state_grad, T &state_atv, T &output_grad,
                           T &checkI, T &checkF, T &checkO, T &checkIGrad,
                           T &checkFGrad, T &checkOGrad,
                           activation_mode_t active_node,
                           activation_mode_t active_gate,
                           activation_mode_t active_state) {
  grad_og = activation(output_grad * state_atv, value_og, active_gate);
  state_grad += activation(output_grad * value_og, state_atv, active_state) +
                grad_og * checkO;
  grad_in = activation(state_grad * value_ig, value_in, active_node);
  grad_ig = activation(state_grad * value_in, value_ig, active_gate);
  grad_fg = activation(state_grad * prev_state, value_fg, active_gate);
  prev_state_grad =
      grad_ig * checkI + grad_fg * checkF + state_grad * value_fg;
  checkIGrad = grad_ig * prev_state;
  checkFGrad = grad_fg * prev_state;
  checkOGrad = grad_og * state;
}
#ifndef __NVCC__ #ifndef __NVCC__
#ifndef __AVX__ // If not compiled with AVX instructs. Disable AVX by default #ifndef __AVX__ // If not compiled with AVX instructs. Disable AVX by default
...@@ -105,32 +109,32 @@ class lstm { ...@@ -105,32 +109,32 @@ class lstm {
// Only float support AVX optimization // Only float support AVX optimization
static const bool avx = std::is_same<T, float>::value; static const bool avx = std::is_same<T, float>::value;
// AVX LSTM backward step. Applies the same formulas as the scalar overload,
// element-wise on __m256 vectors. state_grad is accumulated in two steps
// (activation term first, then grad_og * checkO) and its updated value feeds
// every later statement; prev_state_grad is likewise built in two additions.
HOSTDEVICE void operator()(
    __m256 &value_in, __m256 &value_ig, __m256 &value_fg, __m256 &value_og,
    __m256 &grad_in, __m256 &grad_ig, __m256 &grad_fg, __m256 &grad_og,
    __m256 &prev_state, __m256 &prev_state_grad, __m256 &state,
    __m256 &state_grad, __m256 &state_atv, __m256 &output_grad,
    __m256 &checkI, __m256 &checkF, __m256 &checkO, __m256 &checkIGrad,
    __m256 &checkFGrad, __m256 &checkOGrad, activation_mode_t active_node,
    activation_mode_t active_gate, activation_mode_t active_state) {
  grad_og = activation(_mm256_mul_ps(output_grad, state_atv), value_og,
                       active_gate);
  state_grad = _mm256_add_ps(activation(_mm256_mul_ps(output_grad, value_og),
                                        state_atv, active_state),
                             state_grad);
  state_grad = _mm256_add_ps(_mm256_mul_ps(grad_og, checkO), state_grad);
  grad_in =
      activation(_mm256_mul_ps(state_grad, value_ig), value_in, active_node);
  grad_ig =
      activation(_mm256_mul_ps(state_grad, value_in), value_ig, active_gate);
  grad_fg = activation(_mm256_mul_ps(state_grad, prev_state), value_fg,
                       active_gate);
  prev_state_grad = _mm256_add_ps(_mm256_mul_ps(grad_ig, checkI),
                                  _mm256_mul_ps(grad_fg, checkF));
  prev_state_grad =
      _mm256_add_ps(_mm256_mul_ps(state_grad, value_fg), prev_state_grad);
  checkIGrad = _mm256_mul_ps(grad_ig, prev_state);
  checkFGrad = _mm256_mul_ps(grad_fg, prev_state);
  checkOGrad = _mm256_mul_ps(grad_og, state);
}
#endif #endif
#endif #endif
......
...@@ -21,29 +21,29 @@ namespace math { ...@@ -21,29 +21,29 @@ namespace math {
template <typename T> template <typename T>
struct GRUUnitFunctor<platform::CPUPlace, T> { struct GRUUnitFunctor<platform::CPUPlace, T> {
static void compute(const platform::DeviceContext &context, static void compute(const platform::DeviceContext &context,
hl_gru_value<T> value, int frameSize, int batchSize, hl_gru_value<T> value, int frame_size, int batch_size,
activation_mode_t active_node, activation_mode_t active_node,
activation_mode_t active_gate) { activation_mode_t active_gate) {
#ifndef __NVCC__ #ifndef __NVCC__
if (value.prevOutValue) { if (value.prev_out_value) {
math::gemm<platform::CPUPlace, T>( math::gemm<platform::CPUPlace, T>(
context, false, false, batchSize, frameSize * 2, frameSize, 1, context, false, false, batch_size, frame_size * 2, frame_size, 1,
value.prevOutValue, frameSize, value.gateWeight, frameSize * 2, 1, value.prev_out_value, frame_size, value.gate_weight, frame_size * 2,
value.gateValue, frameSize * 3); 1, value.gate_value, frame_size * 3);
} }
detail::forward_reset_output(detail::forward::gru_resetOutput<T>(), value, detail::forward_reset_output(detail::forward::gru_resetOutput<T>(), value,
frameSize, batchSize, active_gate); frame_size, batch_size, active_gate);
if (value.prevOutValue) { if (value.prev_out_value) {
math::gemm<platform::CPUPlace, T>( math::gemm<platform::CPUPlace, T>(
context, false, false, batchSize, frameSize, frameSize, 1, context, false, false, batch_size, frame_size, frame_size, 1,
value.resetOutputValue, frameSize, value.stateWeight, frameSize, 1, value.reset_output_value, frame_size, value.state_weight, frame_size,
value.gateValue + frameSize * 2, frameSize * 3); 1, value.gate_value + frame_size * 2, frame_size * 3);
} }
detail::forward_final_output(detail::forward::gru_finalOutput<T>(), value, detail::forward_final_output(detail::forward::gru_finalOutput<T>(), value,
frameSize, batchSize, active_node); frame_size, batch_size, active_node);
#endif #endif
} }
}; };
...@@ -51,41 +51,43 @@ struct GRUUnitFunctor<platform::CPUPlace, T> { ...@@ -51,41 +51,43 @@ struct GRUUnitFunctor<platform::CPUPlace, T> {
template <typename T> template <typename T>
struct GRUUnitGradFunctor<platform::CPUPlace, T> { struct GRUUnitGradFunctor<platform::CPUPlace, T> {
static void compute(const platform::DeviceContext &context, static void compute(const platform::DeviceContext &context,
hl_gru_value<T> value, hl_gru_grad<T> grad, int frameSize, hl_gru_value<T> value, hl_gru_grad<T> grad,
int batchSize, activation_mode_t active_node, int frame_size, int batch_size,
activation_mode_t active_node,
activation_mode_t active_gate) { activation_mode_t active_gate) {
#ifndef __NVCC__ #ifndef __NVCC__
detail::backward_state_grad(detail::backward::gru_stateGrad<T>(), value, detail::backward_state_grad(detail::backward::gru_stateGrad<T>(), value,
grad, frameSize, batchSize, active_node); grad, frame_size, batch_size, active_node);
if (value.prevOutValue && grad.prevOutGrad) { if (value.prev_out_value && grad.prev_out_grad) {
math::gemm<platform::CPUPlace, T>( math::gemm<platform::CPUPlace, T>(
context, false, true, batchSize, frameSize, frameSize, 1, context, false, true, batch_size, frame_size, frame_size, 1,
grad.gateGrad + frameSize * 2, frameSize * 3, value.stateWeight, grad.gate_grad + frame_size * 2, frame_size * 3, value.state_weight,
frameSize, 0, grad.resetOutputGrad, frameSize); frame_size, 0, grad.reset_output_grad, frame_size);
if (grad.stateWeightGrad) { if (grad.state_weight_grad) {
math::gemm<platform::CPUPlace, T>( math::gemm<platform::CPUPlace, T>(
context, true, false, frameSize, frameSize, batchSize, 1, context, true, false, frame_size, frame_size, batch_size, 1,
value.resetOutputValue, frameSize, grad.gateGrad + frameSize * 2, value.reset_output_value, frame_size,
frameSize * 3, 1, grad.stateWeightGrad, frameSize); grad.gate_grad + frame_size * 2, frame_size * 3, 1,
grad.state_weight_grad, frame_size);
} }
} }
detail::backward_reset_grad(detail::backward::gru_resetGrad<T>(), value, detail::backward_reset_grad(detail::backward::gru_resetGrad<T>(), value,
grad, frameSize, batchSize, active_gate); grad, frame_size, batch_size, active_gate);
if (grad.prevOutGrad && value.prevOutValue) { if (grad.prev_out_grad && value.prev_out_value) {
math::gemm<platform::CPUPlace, T>( math::gemm<platform::CPUPlace, T>(
context, false, true, batchSize, frameSize, frameSize * 2, 1, context, false, true, batch_size, frame_size, frame_size * 2, 1,
grad.gateGrad, frameSize * 3, value.gateWeight, frameSize * 2, 1, grad.gate_grad, frame_size * 3, value.gate_weight, frame_size * 2, 1,
grad.prevOutGrad, frameSize); grad.prev_out_grad, frame_size);
if (grad.gateWeightGrad) { if (grad.gate_weight_grad) {
math::gemm<platform::CPUPlace, T>( math::gemm<platform::CPUPlace, T>(
context, true, false, frameSize, frameSize * 2, batchSize, 1, context, true, false, frame_size, frame_size * 2, batch_size, 1,
value.prevOutValue, frameSize, grad.gateGrad, frameSize * 3, 1, value.prev_out_value, frame_size, grad.gate_grad, frame_size * 3, 1,
grad.gateWeightGrad, frameSize * 2); grad.gate_weight_grad, frame_size * 2);
} }
} }
#endif #endif
......
...@@ -21,66 +21,66 @@ namespace math { ...@@ -21,66 +21,66 @@ namespace math {
template <typename T> template <typename T>
struct GRUUnitFunctor<platform::GPUPlace, T> { struct GRUUnitFunctor<platform::GPUPlace, T> {
static void compute(const platform::DeviceContext &context, static void compute(const platform::DeviceContext &context,
hl_gru_value<T> value, int frameSize, int batchSize, hl_gru_value<T> value, int frame_size, int batch_size,
activation_mode_t active_node, activation_mode_t active_node,
activation_mode_t active_gate) { activation_mode_t active_gate) {
auto stream = auto stream =
reinterpret_cast<const platform::CUDADeviceContext &>(context).stream(); reinterpret_cast<const platform::CUDADeviceContext &>(context).stream();
dim3 threads; dim3 threads;
dim3 grid; dim3 grid;
if (batchSize == 1) { if (batch_size == 1) {
int framePerBlock = frameSize <= 1024 ? frameSize : 1024; int frame_per_block = frame_size <= 1024 ? frame_size : 1024;
int frameBlocks = (frameSize + 1024 - 1) / 1024; int frame_blocks = (frame_size + 1024 - 1) / 1024;
threads = dim3(framePerBlock, 1); threads = dim3(frame_per_block, 1);
grid = dim3(frameBlocks, 1); grid = dim3(frame_blocks, 1);
} else { } else {
threads = dim3(32, 32); threads = dim3(32, 32);
grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32); grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32);
} }
if (value.prevOutValue) { if (value.prev_out_value) {
math::gemm<platform::GPUPlace, T>( math::gemm<platform::GPUPlace, T>(
context, false, false, batchSize, frameSize * 2, frameSize, 1, context, false, false, batch_size, frame_size * 2, frame_size, 1,
value.prevOutValue, frameSize, value.gateWeight, frameSize * 2, 1, value.prev_out_value, frame_size, value.gate_weight, frame_size * 2,
value.gateValue, frameSize * 3); 1, value.gate_value, frame_size * 3);
} }
if (batchSize == 1) { if (batch_size == 1) {
detail::KeGruForwardResetOutput<detail::forward::gru_resetOutput<T>, detail::KeGruForwardResetOutput<detail::forward::gru_resetOutput<T>,
/* isBatch= */ false, /* is_batch= */ false,
T><<<grid, threads, 0, stream>>>( T><<<grid, threads, 0, stream>>>(
detail::forward::gru_resetOutput<T>(), value.gateValue, detail::forward::gru_resetOutput<T>(), value.gate_value,
value.resetOutputValue, value.prevOutValue, frameSize, batchSize, value.reset_output_value, value.prev_out_value, frame_size,
active_gate); batch_size, active_gate);
} else { } else {
detail::KeGruForwardResetOutput<detail::forward::gru_resetOutput<T>, detail::KeGruForwardResetOutput<detail::forward::gru_resetOutput<T>,
/* isBatch= */ true, /* is_batch= */ true,
T><<<grid, threads, 0, stream>>>( T><<<grid, threads, 0, stream>>>(
detail::forward::gru_resetOutput<T>(), value.gateValue, detail::forward::gru_resetOutput<T>(), value.gate_value,
value.resetOutputValue, value.prevOutValue, frameSize, batchSize, value.reset_output_value, value.prev_out_value, frame_size,
active_gate); batch_size, active_gate);
} }
if (value.prevOutValue) { if (value.prev_out_value) {
math::gemm<platform::GPUPlace, T>( math::gemm<platform::GPUPlace, T>(
context, false, false, batchSize, frameSize, frameSize, 1, context, false, false, batch_size, frame_size, frame_size, 1,
value.resetOutputValue, frameSize, value.stateWeight, frameSize, 1, value.reset_output_value, frame_size, value.state_weight, frame_size,
value.gateValue + frameSize * 2, frameSize * 3); 1, value.gate_value + frame_size * 2, frame_size * 3);
} }
if (batchSize == 1) { if (batch_size == 1) {
detail::KeGruForwardFinalOutput<detail::forward::gru_finalOutput<T>, detail::KeGruForwardFinalOutput<detail::forward::gru_finalOutput<T>,
/* isBatch= */ false, /* is_batch= */ false,
T><<<grid, threads, 0, stream>>>( T><<<grid, threads, 0, stream>>>(
detail::forward::gru_finalOutput<T>(), value.gateValue, detail::forward::gru_finalOutput<T>(), value.gate_value,
value.prevOutValue, value.outputValue, frameSize, batchSize, value.prev_out_value, value.output_value, frame_size, batch_size,
active_node); active_node);
} else { } else {
detail::KeGruForwardFinalOutput<detail::forward::gru_finalOutput<T>, detail::KeGruForwardFinalOutput<detail::forward::gru_finalOutput<T>,
/* isBatch= */ true, /* is_batch= */ true,
T><<<grid, threads, 0, stream>>>( T><<<grid, threads, 0, stream>>>(
detail::forward::gru_finalOutput<T>(), value.gateValue, detail::forward::gru_finalOutput<T>(), value.gate_value,
value.prevOutValue, value.outputValue, frameSize, batchSize, value.prev_out_value, value.output_value, frame_size, batch_size,
active_node); active_node);
} }
} }
...@@ -89,80 +89,82 @@ struct GRUUnitFunctor<platform::GPUPlace, T> { ...@@ -89,80 +89,82 @@ struct GRUUnitFunctor<platform::GPUPlace, T> {
template <typename T> template <typename T>
struct GRUUnitGradFunctor<platform::GPUPlace, T> { struct GRUUnitGradFunctor<platform::GPUPlace, T> {
static void compute(const platform::DeviceContext &context, static void compute(const platform::DeviceContext &context,
hl_gru_value<T> value, hl_gru_grad<T> grad, int frameSize, hl_gru_value<T> value, hl_gru_grad<T> grad,
int batchSize, activation_mode_t active_node, int frame_size, int batch_size,
activation_mode_t active_node,
activation_mode_t active_gate) { activation_mode_t active_gate) {
auto stream = auto stream =
reinterpret_cast<const platform::CUDADeviceContext &>(context).stream(); reinterpret_cast<const platform::CUDADeviceContext &>(context).stream();
dim3 threads; dim3 threads;
dim3 grid; dim3 grid;
if (batchSize == 1) { if (batch_size == 1) {
int framePerBlock = frameSize <= 1024 ? frameSize : 1024; int frame_per_block = frame_size <= 1024 ? frame_size : 1024;
int frameBlocks = (frameSize + 1024 - 1) / 1024; int frame_blocks = (frame_size + 1024 - 1) / 1024;
threads = dim3(framePerBlock, 1); threads = dim3(frame_per_block, 1);
grid = dim3(frameBlocks, 1); grid = dim3(frame_blocks, 1);
} else { } else {
threads = dim3(32, 32); threads = dim3(32, 32);
grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32); grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32);
} }
if (batchSize == 1) { if (batch_size == 1) {
detail::KeGruBackwardStateGrad< detail::KeGruBackwardStateGrad<
detail::backward::gru_stateGrad<T>, detail::backward::gru_stateGrad<T>,
/* isBatch= */ false><<<grid, threads, 0, stream>>>( /* is_batch= */ false><<<grid, threads, 0, stream>>>(
detail::backward::gru_stateGrad<T>(), value.gateValue, grad.gateGrad, detail::backward::gru_stateGrad<T>(), value.gate_value,
value.prevOutValue, grad.prevOutGrad, grad.outputGrad, frameSize, grad.gate_grad, value.prev_out_value, grad.prev_out_grad,
batchSize, active_node); grad.output_grad, frame_size, batch_size, active_node);
} else { } else {
detail::KeGruBackwardStateGrad< detail::KeGruBackwardStateGrad<
detail::backward::gru_stateGrad<T>, detail::backward::gru_stateGrad<T>,
/* isBatch= */ true><<<grid, threads, 0, stream>>>( /* is_batch= */ true><<<grid, threads, 0, stream>>>(
detail::backward::gru_stateGrad<T>(), value.gateValue, grad.gateGrad, detail::backward::gru_stateGrad<T>(), value.gate_value,
value.prevOutValue, grad.prevOutGrad, grad.outputGrad, frameSize, grad.gate_grad, value.prev_out_value, grad.prev_out_grad,
batchSize, active_node); grad.output_grad, frame_size, batch_size, active_node);
} }
if (value.prevOutValue && grad.prevOutGrad) { if (value.prev_out_value && grad.prev_out_grad) {
math::gemm<platform::GPUPlace, T>( math::gemm<platform::GPUPlace, T>(
context, false, true, batchSize, frameSize, frameSize, 1, context, false, true, batch_size, frame_size, frame_size, 1,
grad.gateGrad + frameSize * 2, frameSize * 3, value.stateWeight, grad.gate_grad + frame_size * 2, frame_size * 3, value.state_weight,
frameSize, 0, grad.resetOutputGrad, frameSize); frame_size, 0, grad.reset_output_grad, frame_size);
if (grad.stateWeightGrad) { if (grad.state_weight_grad) {
math::gemm<platform::GPUPlace, T>( math::gemm<platform::GPUPlace, T>(
context, true, false, frameSize, frameSize, batchSize, 1, context, true, false, frame_size, frame_size, batch_size, 1,
value.resetOutputValue, frameSize, grad.gateGrad + frameSize * 2, value.reset_output_value, frame_size,
frameSize * 3, 1, grad.stateWeightGrad, frameSize); grad.gate_grad + frame_size * 2, frame_size * 3, 1,
grad.state_weight_grad, frame_size);
} }
} }
if (batchSize == 1) { if (batch_size == 1) {
detail::KeGruBackwardResetGrad< detail::KeGruBackwardResetGrad<
detail::backward::gru_resetGrad<T>, detail::backward::gru_resetGrad<T>,
/* isBatch= */ false><<<grid, threads, 0, stream>>>( /* is_batch= */ false><<<grid, threads, 0, stream>>>(
detail::backward::gru_resetGrad<T>(), value.gateValue, grad.gateGrad, detail::backward::gru_resetGrad<T>(), value.gate_value,
value.prevOutValue, grad.prevOutGrad, grad.resetOutputGrad, frameSize, grad.gate_grad, value.prev_out_value, grad.prev_out_grad,
batchSize, active_gate); grad.reset_output_grad, frame_size, batch_size, active_gate);
} else { } else {
detail::KeGruBackwardResetGrad< detail::KeGruBackwardResetGrad<
detail::backward::gru_resetGrad<T>, detail::backward::gru_resetGrad<T>,
/* isBatch= */ true><<<grid, threads, 0, stream>>>( /* is_batch= */ true><<<grid, threads, 0, stream>>>(
detail::backward::gru_resetGrad<T>(), value.gateValue, grad.gateGrad, detail::backward::gru_resetGrad<T>(), value.gate_value,
value.prevOutValue, grad.prevOutGrad, grad.resetOutputGrad, frameSize, grad.gate_grad, value.prev_out_value, grad.prev_out_grad,
batchSize, active_gate); grad.reset_output_grad, frame_size, batch_size, active_gate);
} }
if (grad.prevOutGrad && value.prevOutValue) { if (grad.prev_out_grad && value.prev_out_value) {
math::gemm<platform::GPUPlace, T>( math::gemm<platform::GPUPlace, T>(
context, false, true, batchSize, frameSize, frameSize * 2, 1, context, false, true, batch_size, frame_size, frame_size * 2, 1,
grad.gateGrad, frameSize * 3, value.gateWeight, frameSize * 2, 1, grad.gate_grad, frame_size * 3, value.gate_weight, frame_size * 2, 1,
grad.prevOutGrad, frameSize); grad.prev_out_grad, frame_size);
if (grad.gateWeightGrad) { if (grad.gate_weight_grad) {
math::gemm<platform::GPUPlace, T>( math::gemm<platform::GPUPlace, T>(
context, true, false, frameSize, frameSize * 2, batchSize, 1, context, true, false, frame_size, frame_size * 2, batch_size, 1,
value.prevOutValue, frameSize, grad.gateGrad, frameSize * 3, 1, value.prev_out_value, frame_size, grad.gate_grad, frame_size * 3, 1,
grad.gateWeightGrad, frameSize * 2); grad.gate_weight_grad, frame_size * 2);
} }
} }
} }
......
...@@ -22,28 +22,28 @@ namespace math { ...@@ -22,28 +22,28 @@ namespace math {
// TODO(guosheng): refine code style in gru_compute // TODO(guosheng): refine code style in gru_compute
template <typename T> template <typename T>
struct hl_gru_value { struct hl_gru_value {
T *gateWeight; T *gate_weight;
T *stateWeight; T *state_weight;
T *gateValue; T *gate_value;
T *resetOutputValue; T *reset_output_value;
T *outputValue; T *output_value;
T *prevOutValue; T *prev_out_value;
}; };
template <typename T> template <typename T>
struct hl_gru_grad { struct hl_gru_grad {
T *gateWeightGrad; T *gate_weight_grad;
T *stateWeightGrad; T *state_weight_grad;
T *gateGrad; T *gate_grad;
T *resetOutputGrad; T *reset_output_grad;
T *outputGrad; T *output_grad;
T *prevOutGrad; T *prev_out_grad;
}; };
template <typename Place, typename T> template <typename Place, typename T>
struct GRUUnitFunctor { struct GRUUnitFunctor {
static void compute(const platform::DeviceContext &context, static void compute(const platform::DeviceContext &context,
hl_gru_value<T> value, int frameSize, int batchSize, hl_gru_value<T> value, int frame_size, int batch_size,
activation_mode_t active_node, activation_mode_t active_node,
activation_mode_t active_gate); activation_mode_t active_gate);
}; };
...@@ -51,8 +51,9 @@ struct GRUUnitFunctor { ...@@ -51,8 +51,9 @@ struct GRUUnitFunctor {
template <typename Place, typename T> template <typename Place, typename T>
struct GRUUnitGradFunctor { struct GRUUnitGradFunctor {
static void compute(const platform::DeviceContext &context, static void compute(const platform::DeviceContext &context,
hl_gru_value<T> value, hl_gru_grad<T> grad, int frameSize, hl_gru_value<T> value, hl_gru_grad<T> grad,
int batchSize, activation_mode_t active_node, int frame_size, int batch_size,
activation_mode_t active_node,
activation_mode_t active_gate); activation_mode_t active_gate);
}; };
......
...@@ -30,12 +30,12 @@ struct LstmUnitFunctor<platform::CPUPlace, T> { ...@@ -30,12 +30,12 @@ struct LstmUnitFunctor<platform::CPUPlace, T> {
detail::cpu_lstm_forward(detail::forward::lstm<T>(), value, frame_size, detail::cpu_lstm_forward(detail::forward::lstm<T>(), value, frame_size,
ActiveType(cand_act), ActiveType(gate_act), ActiveType(cand_act), ActiveType(gate_act),
ActiveType(cell_act)); ActiveType(cell_act));
value.gateValue += frame_size * 4; value.gate_value += frame_size * 4;
value.stateValue += frame_size; value.state_value += frame_size;
value.stateActiveValue += frame_size; value.state_active_value += frame_size;
value.outputValue += frame_size; value.output_value += frame_size;
if (value.prevStateValue) { if (value.prev_state_value) {
value.prevStateValue += frame_size; value.prev_state_value += frame_size;
} }
} }
} }
...@@ -53,20 +53,20 @@ struct LstmUnitGradFunctor<platform::CPUPlace, T> { ...@@ -53,20 +53,20 @@ struct LstmUnitGradFunctor<platform::CPUPlace, T> {
frame_size, ActiveType(cand_act), frame_size, ActiveType(cand_act),
ActiveType(gate_act), ActiveType(cell_act)); ActiveType(gate_act), ActiveType(cell_act));
value.gateValue += frame_size * 4; value.gate_value += frame_size * 4;
value.stateValue += frame_size; value.state_value += frame_size;
value.stateActiveValue += frame_size; value.state_active_value += frame_size;
value.outputValue += frame_size; value.output_value += frame_size;
if (value.prevStateValue) { if (value.prev_state_value) {
value.prevStateValue += frame_size; value.prev_state_value += frame_size;
} }
grad.gateGrad += frame_size * 4; grad.gate_grad += frame_size * 4;
grad.stateGrad += frame_size; grad.state_grad += frame_size;
grad.stateActiveGrad += frame_size; grad.state_active_grad += frame_size;
grad.outputGrad += frame_size; grad.output_grad += frame_size;
if (grad.prevStateGrad) { if (grad.prev_state_grad) {
grad.prevStateGrad += frame_size; grad.prev_state_grad += frame_size;
} }
} }
} }
......
...@@ -31,26 +31,26 @@ typedef enum { ...@@ -31,26 +31,26 @@ typedef enum {
template <class T> template <class T>
struct LstmMetaValue { struct LstmMetaValue {
T *gateValue; T *gate_value;
T *prevStateValue; T *prev_state_value;
T *stateValue; T *state_value;
T *stateActiveValue; T *state_active_value;
T *outputValue; T *output_value;
T *checkIg; T *check_ig;
T *checkFg; T *check_fg;
T *checkOg; T *check_og;
}; };
template <class T> template <class T>
struct LstmMetaGrad { struct LstmMetaGrad {
T *gateGrad; T *gate_grad;
T *prevStateGrad; T *prev_state_grad;
T *stateGrad; T *state_grad;
T *stateActiveGrad; T *state_active_grad;
T *outputGrad; T *output_grad;
T *checkIgGrad; T *check_ig_grad;
T *checkFgGrad; T *check_fg_grad;
T *checkOgGrad; T *check_og_grad;
}; };
inline activation_mode_t ActiveType(const std::string &type) { inline activation_mode_t ActiveType(const std::string &type) {
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/math/unpooling.h"
namespace paddle {
namespace operators {
namespace math {
/*
 * CPU specialization of the 2-D max-unpooling forward pass.
 *
 * Dimensions are read as [batch, channels, height, width]. For every
 * (batch, channel) feature map, each input element is copied to the position
 * stored for it in `indices`, interpreted as a flat offset inside the matching
 * output feature map.
 *
 * Only the positions named by `indices` are written; every other output
 * element is left untouched (presumably the caller pre-fills the output --
 * confirm against the op kernel).
 */
template <typename T>
class Unpool2dMaxFunctor<platform::CPUPlace, T> {
 public:
  void operator()(const platform::DeviceContext& context,
                  const framework::Tensor& input,
                  const framework::Tensor& indices, framework::Tensor* output) {
    const int batch_size = input.dims()[0];
    const int input_height = input.dims()[2];
    const int input_width = input.dims()[3];
    const int output_channels = output->dims()[1];
    const int output_height = output->dims()[2];
    const int output_width = output->dims()[3];
    // Number of elements in one input / output feature map.
    const int input_feasize = input_height * input_width;
    const int output_feasize = output_height * output_width;
    const T* input_data = input.data<T>();
    const int* indices_data = indices.data<int>();
    T* output_data = output->mutable_data<T>(context.GetPlace());
    for (int b = 0; b < batch_size; ++b) {
      for (int c = 0; c < output_channels; ++c) {
        for (int i = 0; i < input_feasize; ++i) {
          const int index = indices_data[i];
          // Reject negative offsets as well: they would write in front of the
          // current feature map (or in front of the whole buffer).
          PADDLE_ENFORCE(index >= 0 && index < output_feasize,
                         "err index in unpooling!");
          output_data[index] = input_data[i];
        }
        // Advance all three cursors to the next feature map.
        input_data += input_feasize;
        indices_data += input_feasize;
        output_data += output_feasize;
      }
    }
  }
};
/*
 * CPU specialization of the 2-D max-unpooling backward pass.
 *
 * Dimensions are read as [batch, channels, height, width]. For every
 * (batch, channel) feature map, the gradient of each input element is
 * gathered from `output_grad` at the flat offset stored in `indices`.
 *
 * `input` and `output` are consulted for their dimensions only.
 */
template <class T>
class Unpool2dMaxGradFunctor<platform::CPUPlace, T> {
 public:
  void operator()(const platform::DeviceContext& context,
                  const framework::Tensor& input,
                  const framework::Tensor& indices,
                  const framework::Tensor& output,
                  const framework::Tensor& output_grad,
                  framework::Tensor* input_grad) {
    const int batch_size = input.dims()[0];
    const int input_height = input.dims()[2];
    const int input_width = input.dims()[3];
    const int output_channels = output.dims()[1];
    const int output_height = output.dims()[2];
    const int output_width = output.dims()[3];
    // Number of elements in one input / output feature map.
    const int input_feasize = input_height * input_width;
    const int output_feasize = output_height * output_width;
    const int* indices_data = indices.data<int>();
    const T* output_grad_data = output_grad.data<T>();
    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
    for (int b = 0; b < batch_size; ++b) {
      for (int c = 0; c < output_channels; ++c) {
        for (int i = 0; i < input_feasize; ++i) {
          const int index = indices_data[i];
          // Reject negative offsets as well: they would read in front of the
          // current feature map (or in front of the whole buffer).
          PADDLE_ENFORCE(index >= 0 && index < output_feasize,
                         "err index in unpooling!");
          input_grad_data[i] = output_grad_data[index];
        }
        // Advance all three cursors to the next feature map.
        input_grad_data += input_feasize;
        indices_data += input_feasize;
        output_grad_data += output_feasize;
      }
    }
  }
};
// Explicit instantiations of the CPU unpooling functors for float and double.
template class Unpool2dMaxGradFunctor<platform::CPUPlace, float>;
template class Unpool2dMaxGradFunctor<platform::CPUPlace, double>;
template class Unpool2dMaxFunctor<platform::CPUPlace, float>;
template class Unpool2dMaxFunctor<platform::CPUPlace, double>;
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/math/unpooling.h"
#include "paddle/platform/cuda_helper.h"
namespace paddle {
namespace operators {
namespace math {
/*
 * Forward max-unpooling kernel.
 *
 * Each thread starts at its global thread id and advances by the total number
 * of launched threads until `nthreads` input elements have been handled.
 * Element i is copied to the offset `indices_data[i]` inside the output
 * feature map that belongs to the same (batch, channel) pair.
 *
 * Only the referenced output positions are written.
 */
template <typename T>
__global__ void KernelUnpool2dMax(const int nthreads, const T* input_data,
                                  const int* indices_data,
                                  const int input_height, const int input_width,
                                  const int channels, T* output_data,
                                  const int output_height,
                                  const int output_width) {
  // Element counts per sample (n) and per feature map (c).
  int in_n_stride = input_height * input_width * channels;
  int in_c_stride = input_height * input_width;
  int out_n_stride = output_height * output_width * channels;
  int out_c_stride = output_height * output_width;
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int offset = blockDim.x * gridDim.x;
  for (int i = index; i < nthreads; i += offset) {
    // Recover the (batch, channel) pair of flat input element i.
    int bidx = i / in_n_stride;
    int boffset = i % in_n_stride;
    int cidx = boffset / in_c_stride;
    int out_offset = bidx * out_n_stride + cidx * out_c_stride;
    int out_index = indices_data[i];
    // A negative offset would write outside the target feature map too.
    PADDLE_ASSERT(out_index >= 0 && out_index < out_c_stride);
    output_data[out_offset + out_index] = input_data[i];
  }
}
/*
 * Backward max-unpooling kernel.
 *
 * Each thread starts at its global thread id and advances by the total number
 * of launched threads until `nthreads` input elements have been handled.
 * The gradient of input element i is read from `output_grad` at the offset
 * `indices_data[i]` inside the output feature map of the same
 * (batch, channel) pair.
 *
 * `input_data` and `output_data` are accepted but not read by this kernel.
 */
template <typename T>
__global__ void KernelUnpool2dMaxGrad(
    const int nthreads, const T* input_data, const int* indices_data,
    const int input_height, const int input_width, const int channels,
    const T* output_data, const T* output_grad, const int output_height,
    const int output_width, T* input_grad) {
  // Element counts per sample (n) and per feature map (c).
  int in_n_stride = input_height * input_width * channels;
  int in_c_stride = input_height * input_width;
  int out_n_stride = output_height * output_width * channels;
  int out_c_stride = output_height * output_width;
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int offset = blockDim.x * gridDim.x;
  for (int i = index; i < nthreads; i += offset) {
    // Recover the (batch, channel) pair of flat input element i.
    int bidx = i / in_n_stride;
    int boffset = i % in_n_stride;
    int cidx = boffset / in_c_stride;
    int out_offset = bidx * out_n_stride + cidx * out_c_stride;
    int out_index = indices_data[i];
    // A negative offset would read outside the source feature map too.
    PADDLE_ASSERT(out_index >= 0 && out_index < out_c_stride);
    input_grad[i] = output_grad[out_offset + out_index];
  }
}
/*
 * All tensors are in NCHW format.
 *
 * GPU specialization of the 2-D max-unpooling forward pass. Launches
 * KernelUnpool2dMax on the stream of the given CUDA device context with
 * 1024 threads per block and enough blocks to cover every input element.
 */
template <typename T>
class Unpool2dMaxFunctor<platform::GPUPlace, T> {
 public:
  void operator()(const platform::DeviceContext& context,
                  const framework::Tensor& input,
                  const framework::Tensor& indices, framework::Tensor* output) {
    const int input_height = input.dims()[2];
    const int input_width = input.dims()[3];
    const int output_channels = output->dims()[1];
    const int output_height = output->dims()[2];
    const int output_width = output->dims()[3];
    const T* input_data = input.data<T>();
    const int* indices_data = indices.data<int>();
    T* output_data = output->mutable_data<T>(context.GetPlace());
    int threads = 1024;
    // Ceiling division: one thread per input element.
    int grid = (input.numel() + threads - 1) / threads;
    KernelUnpool2dMax<
        T><<<grid, threads, 0,
             reinterpret_cast<const platform::CUDADeviceContext&>(context)
                 .stream()>>>(input.numel(), input_data, indices_data,
                              input_height, input_width, output_channels,
                              output_data, output_height, output_width);
  }
};
/*
 * All tensors are in NCHW format.
 *
 * GPU specialization of the 2-D max-unpooling backward pass. Launches
 * KernelUnpool2dMaxGrad on the stream of the given CUDA device context with
 * 1024 threads per block and enough blocks to cover every input element.
 */
template <typename T>
class Unpool2dMaxGradFunctor<platform::GPUPlace, T> {
 public:
  void operator()(const platform::DeviceContext& context,
                  const framework::Tensor& input,
                  const framework::Tensor& indices,
                  const framework::Tensor& output,
                  const framework::Tensor& output_grad,
                  framework::Tensor* input_grad) {
    const int input_height = input.dims()[2];
    const int input_width = input.dims()[3];
    const int output_channels = output.dims()[1];
    const int output_height = output.dims()[2];
    const int output_width = output.dims()[3];
    const T* input_data = input.data<T>();
    const int* indices_data = indices.data<int>();
    const T* output_data = output.data<T>();
    const T* output_grad_data = output_grad.data<T>();
    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
    int threads = 1024;
    // Ceiling division: one thread per input element.
    int grid = (input.numel() + threads - 1) / threads;
    KernelUnpool2dMaxGrad<
        T><<<grid, threads, 0,
             reinterpret_cast<const platform::CUDADeviceContext&>(context)
                 .stream()>>>(input.numel(), input_data, indices_data,
                              input_height, input_width, output_channels,
                              output_data, output_grad_data, output_height,
                              output_width, input_grad_data);
  }
};
// Explicit instantiations of the GPU unpooling functors for float and double.
template class Unpool2dMaxGradFunctor<platform::GPUPlace, float>;
template class Unpool2dMaxGradFunctor<platform::GPUPlace, double>;
template class Unpool2dMaxFunctor<platform::GPUPlace, float>;
template class Unpool2dMaxFunctor<platform::GPUPlace, double>;
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/tensor.h"
namespace paddle {
namespace operators {
namespace math {
/*
 * Primary template of the 2-D max-unpooling forward functor.
 *
 * operator() is only declared here; the implementation is expected to come
 * from Place-specific specializations.
 *
 *   context  device context the computation runs on
 *   input    tensor holding the pooled values
 *   indices  tensor holding, for each input element, its target position
 *   output   tensor that receives the unpooled values
 */
template <typename Place, typename T>
class Unpool2dMaxFunctor {
 public:
  void operator()(const platform::DeviceContext& context,
                  const framework::Tensor& input,
                  const framework::Tensor& indices, framework::Tensor* output);
};
/*
 * Primary template of the 2-D max-unpooling backward functor.
 *
 * Only the call operator is declared; Place-specific specializations are
 * expected to supply the body.
 *
 *   context      device context the computation runs on
 *   input        forward input tensor
 *   indices      positions recorded for each input element
 *   output       forward output tensor
 *   output_grad  gradient with respect to the forward output
 *   input_grad   receives the gradient with respect to the forward input
 */
template <typename Place, typename T>
class Unpool2dMaxGradFunctor {
 public:
  void operator()(const platform::DeviceContext& context,
                  const framework::Tensor& input,
                  const framework::Tensor& indices,
                  const framework::Tensor& output,
                  const framework::Tensor& output_grad,
                  framework::Tensor* input_grad);
};
} // namespace math
} // namespace operators
} // namespace paddle
...@@ -99,13 +99,7 @@ class MultiplexGradOp : public framework::OperatorWithKernel { ...@@ -99,13 +99,7 @@ class MultiplexGradOp : public framework::OperatorWithKernel {
"Output(X@Grad) should not be null."); "Output(X@Grad) should not be null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) should not be null."); "Input(Out@GRAD) should not be null.");
std::vector<framework::DDim> d_ins; ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X"));
auto ins = ctx->GetInputsDim("X");
// No need to compute gradient for Input(Ids)
for (size_t i = 0; i < ins.size(); i++) {
d_ins.push_back(ins[i]);
}
ctx->SetOutputsDim(framework::GradVarName("X"), d_ins);
} }
protected: protected:
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/nce_op.h"
namespace paddle {
namespace operators {
using framework::Tensor;
// Forward operator of noise-contrastive estimation: validates its inputs and
// attributes and derives the shapes of Cost, SampleLogits and SampleLabels.
class NCEOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
    // Mandatory inputs and outputs.
    PADDLE_ENFORCE(ctx->HasInput("Input"));
    PADDLE_ENFORCE(ctx->HasInput("Label"));
    PADDLE_ENFORCE(ctx->HasInput("Weight"));
    PADDLE_ENFORCE(ctx->HasOutput("Cost"));
    PADDLE_ENFORCE(ctx->HasOutput("SampleLogits"));
    PADDLE_ENFORCE(ctx->HasOutput("SampleLabels"));

    // Input and Label must agree on their first dimension.
    auto x_dims = ctx->GetInputDim("Input");
    auto label_dims = ctx->GetInputDim("Label");
    PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0]);

    // A rank-2 Label carries its per-sample class count in the second
    // dimension; any other rank counts as one class per sample.
    int num_true_classes = 1;
    if (label_dims.size() == 2) {
      num_true_classes = label_dims[1];
    }

    // Bias is optional; when given it must have one row per Weight row.
    if (ctx->HasInput("Bias")) {
      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Weight")[0],
                        ctx->GetInputDim("Bias")[0]);
    }

    const int num_neg_samples = ctx->Attrs().Get<int>("num_neg_samples");
    const int num_total_classes = ctx->Attrs().Get<int>("num_total_classes");
    std::vector<int> custom_neg_classes =
        ctx->Attrs().Get<std::vector<int>>("custom_neg_classes");
    PADDLE_ENFORCE_EQ(num_total_classes, ctx->GetInputDim("Weight")[0]);
    // A non-empty custom list must provide exactly num_neg_samples classes.
    if (!custom_neg_classes.empty()) {
      PADDLE_ENFORCE_EQ(custom_neg_classes.size(),
                        static_cast<size_t>(num_neg_samples));
    }

    // Cost: one value per sample.
    const std::vector<int64_t> out_dims{x_dims[0], 1};
    ctx->SetOutputDim("Cost", framework::make_ddim(out_dims));

    // SampleLogits / SampleLabels: one column per true or negative class.
    const std::vector<int64_t> sample_out_dims{
        x_dims[0], static_cast<int64_t>(num_neg_samples + num_true_classes)};
    ctx->SetOutputDim("SampleLogits", framework::make_ddim(sample_out_dims));
    ctx->SetOutputDim("SampleLabels", framework::make_ddim(sample_out_dims));
  }

 protected:
  // The kernel data type follows the element type of Input.
  framework::OpKernelType GetKernelType(
      const framework::ExecutionContext& ctx) const override {
    const auto* input_tensor = ctx.Input<Tensor>("Input");
    return framework::OpKernelType(
        framework::ToDataType(input_tensor->type()), ctx.device_context());
  }
};
// Declares the inputs, outputs, attributes and documentation of the nce
// operator.
class NCEOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  NCEOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Input", "(Tensor) A tensor of shape [batch_size, dim].");
    AddInput(
        "Label",
        "(Tensor) A tensor of shape [batch_size, num_true_class]. "
        "'num_true_class' is the number of target classes in each sample. "
        "The number of target classes per sample should be same. "
        "If you have a variable number of target classes, "
        "you can pad them out to a constant number by either repeating them"
        " or by padding with an otherwise unused class.");
    AddInput("Weight",
             "(Tensor) A tensor of shape [num_class, dim]. 'num_class' is the "
             "total number of class.");
    AddInput(
        "Bias",
        "(Tensor) A tensor of shape [num_class, 1]. 'num_class' is the total "
        "number of class. It is a dispensable input.")
        .AsDispensable();
    AddInput("SampleWeight",
             "(Tensor) A tensor of shape [batch_size, 1] storing a weight for "
             "each sample. And it is a dispensable input. The default value of "
             "sample is 1.")
        .AsDispensable();
    AddOutput("Cost",
              "(Tensor) A tensor of shape [batch_size, 1]. Cost of samples.");
    AddOutput("SampleLogits",
              "An intermediate tensor of shape[batch_size, num_neg_samples + "
              "num_pos_samples]. "
              "This tensor is output of forward kernel and used in backward "
              "kernel to compute grads. "
              "Given X is the dot product of input tensor and sampled labels' "
              "weights. "
              "Then 'SampleLogits' is sigmoid(X).")
        .AsIntermediate();
    AddOutput("SampleLabels",
              "An intermediate tensor of shape[batch_size, num_neg_samples + "
              "num_pos_samples]. "
              "This tensor is output of forward kernel and used in backward "
              "kernel to compute grads.")
        .AsIntermediate();
    AddAttr<int>("num_total_classes",
                 "Total number of classes in all samples.");
    AddAttr<int>("num_neg_samples",
                 "The number of negative classes. The default value is 10.")
        .SetDefault(10);
    // Default to an empty list: the description tells users not to set this
    // attribute, so it must not be left without a default value.
    AddAttr<std::vector<int>>("custom_neg_classes",
                              "This attribute is only used in unit tests. "
                              "Classes in this list will be used as negative "
                              "classes for every sample. Under normal "
                              "conditions, user should avoid setting this "
                              "attribute.")
        .SetDefault(std::vector<int>());
    AddComment(R"DOC(
Compute and return the noise-contrastive estimation training loss.
See [Noise-contrastive estimation: A new estimation principle for unnormalized statistical models](http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf).
By default this operator uses a uniform distribution for sampling.
)DOC");
  }
};
// Gradient operator of nce: checks that the forward tensors and Cost@GRAD are
// present and gives every requested gradient output the shape of the forward
// variable it belongs to.
class NCEOpGrad : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("Input"));
    PADDLE_ENFORCE(ctx->HasInput("Weight"));
    PADDLE_ENFORCE(ctx->HasInput("Cost"));
    PADDLE_ENFORCE(ctx->HasInput("SampleLogits"));
    PADDLE_ENFORCE(ctx->HasInput("SampleLabels"));
    // The message names the variable that is actually checked (Cost@GRAD).
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Cost")),
                   "The input(Cost@GRAD) should not be null.");

    // Input@GRAD mirrors Input.
    auto x_dims = ctx->GetInputDim("Input");
    auto x_grad_name = framework::GradVarName("Input");
    if (ctx->HasOutput(x_grad_name)) {
      ctx->SetOutputDim(x_grad_name, x_dims);
    }

    // Weight@GRAD mirrors Weight.
    auto w_dims = ctx->GetInputDim("Weight");
    auto w_grad_name = framework::GradVarName("Weight");
    if (ctx->HasOutput(w_grad_name)) {
      ctx->SetOutputDim(w_grad_name, w_dims);
    }

    // Bias is only queried when its gradient output is requested.
    auto bias_grad_name = framework::GradVarName("Bias");
    if (ctx->HasOutput(bias_grad_name)) {
      auto bias_dims = ctx->GetInputDim("Bias");
      ctx->SetOutputDim(bias_grad_name, bias_dims);
    }
  }

 protected:
  // The kernel data type follows the element type of Input.
  framework::OpKernelType GetKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<Tensor>("Input")->type()),
        ctx.device_context());
  }
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
// Register the nce operator with its proto maker, together with its gradient
// operator nce_grad.
REGISTER_OP(nce, ops::NCEOp, ops::NCEOpMaker, nce_grad, ops::NCEOpGrad);
// CPU kernels of the forward operator for float and double.
REGISTER_OP_CPU_KERNEL(nce, ops::NCEKernel<paddle::platform::CPUPlace, float>,
                       ops::NCEKernel<paddle::platform::CPUPlace, double>);
// CPU kernels of the gradient operator for float and double.
REGISTER_OP_CPU_KERNEL(nce_grad,
                       ops::NCEGradKernel<paddle::platform::CPUPlace, float>,
                       ops::NCEGradKernel<paddle::platform::CPUPlace, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <math.h>
#include <random>
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
#include "unsupported/Eigen/CXX11/Tensor"
namespace paddle {
namespace operators {
// Shorthand for the framework tensor type used by the kernels in this header.
using Tensor = framework::Tensor;
// Alias that forwards to framework::EigenMatrix, defaulting the storage order
// to Eigen::RowMajor and the index type to Eigen::DenseIndex.
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
// Fills Output(SampleLabels) with the class ids that will be scored for every
// row of Input(Label).
//
// Each row is written contiguously as:
//   1. the row's true labels copied from Input(Label) (one label per row when
//      Label is not 2-D), followed by
//   2. either every entry of the `custom_neg_classes` attribute (when it is
//      non-empty), or uniformly drawn ids in [0, num_total_classes - 1] until
//      the row reaches the second dimension of SampleLabels.
//
// NOTE(review): the length of `custom_neg_classes` is not validated against
// the width of SampleLabels here -- confirm the op's shape inference
// guarantees they match.
template <typename Place, typename T>
void PrepareSamples(const framework::ExecutionContext& context) {
  // True labels supplied by the caller.
  auto true_labels = context.Input<Tensor>("Label");
  const int64_t* true_label_data = true_labels->data<int64_t>();
  auto true_label_dims = true_labels->dims();

  const int total_classes = context.Attr<int>("num_total_classes");
  // Fixed negative classes; a non-empty list replaces random sampling.
  std::vector<int> custom_neg_classes =
      context.Attr<std::vector<int>>("custom_neg_classes");

  // Uniform sampler over all class ids, seeded from the random device.
  std::random_device seed_source;
  std::mt19937 engine(seed_source());
  std::uniform_int_distribution<int> draw_class(0, total_classes - 1);

  auto sampled = context.Output<Tensor>("SampleLabels");
  auto sampled_dims = sampled->dims();
  int64_t* cursor = sampled->mutable_data<int64_t>(context.GetPlace());

  int true_per_row = 1;
  if (true_label_dims.size() == 2) {
    true_per_row = true_label_dims[1];
  }
  const bool use_custom_negatives = !custom_neg_classes.empty();
  const size_t row_count = static_cast<size_t>(true_label_dims[0]);

  for (size_t row = 0; row < row_count; ++row) {
    // 1) true classes of this row
    const int64_t* row_labels = true_label_data + row * true_per_row;
    int col = 0;
    while (col < true_per_row) {
      *cursor++ = row_labels[col];
      ++col;
    }
    // 2) negative classes of this row
    if (use_custom_negatives) {
      for (size_t k = 0; k < custom_neg_classes.size(); ++k) {
        *cursor++ = custom_neg_classes[k];
      }
      continue;
    }
    // TODO(wanghaoshuang): support more distribution sampling
    while (col < sampled_dims[1]) {
      *cursor++ = draw_class(engine);
      ++col;
    }
  }
}
// Forward kernel of the NCE operator.
//
// Steps performed by Compute():
//   1. PrepareSamples() fills Output(SampleLabels) with the true and negative
//      class ids of every row.
//   2. For every sampled class id, Output(SampleLogits) receives
//      sigmoid(dot(Input[row], Weight[class]) + Bias[class]); Bias is optional
//      and treated as zero when absent.
//   3. Output(Cost)[row] accumulates, scaled by the optional per-row
//      SampleWeight, -log(o / (o + b)) over the true classes and
//      -log(b / (o + b)) over the remaining (negative) classes, where
//      b = num_neg_samples / num_total_classes.
template <typename Place, typename T>
class NCEKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    PrepareSamples<Place, T>(context);
    auto sample_labels = context.Output<Tensor>("SampleLabels");
    const int64_t* sample_labels_data = sample_labels->data<int64_t>();
    auto sample_out = context.Output<Tensor>("SampleLogits");
    T* sample_out_data = sample_out->mutable_data<T>(context.GetPlace());
    auto label = context.Input<Tensor>("Label");
    auto sample_weight = context.Input<Tensor>("SampleWeight");
    const T* sample_weight_data = nullptr;
    if (sample_weight != nullptr) {
      sample_weight_data = sample_weight->data<T>();
    }
    auto out = context.Output<Tensor>("Cost");
    T* out_data = out->mutable_data<T>(context.GetPlace());
    int num_neg_samples = context.Attr<int>("num_neg_samples");
    int num_total_classes = context.Attr<int>("num_total_classes");

    // Number of true classes per row. Mirrors PrepareSamples(): a Label that
    // is not 2-D contributes exactly one true class per row, so dims()[1] is
    // only read when that dimension exists.
    int64_t num_true_class = 1;
    if (label != nullptr && label->dims().size() == 2) {
      num_true_class = label->dims()[1];
    }
    T b = 1. / num_total_classes * num_neg_samples;

    // Loop-invariant sizes, read once instead of on every iteration.
    const int64_t num_samples = sample_labels->numel();
    const int64_t batch_size = sample_labels->dims()[0];
    const int64_t sample_width = sample_labels->dims()[1];
    const int64_t logits_width = sample_out->dims()[1];

    // forward bias
    auto bias = context.Input<Tensor>("Bias");
    if (bias != nullptr) {
      const T* bias_data = bias->data<T>();
      for (int64_t i = 0; i < num_samples; ++i) {
        sample_out_data[i] = bias_data[sample_labels_data[i]];
      }
    } else {
      for (int64_t i = 0; i < num_samples; ++i) {
        sample_out_data[i] = 0;
      }
    }

    // forward mul
    auto input_mat = EigenMatrix<T>::From(*(context.Input<Tensor>("Input")));
    auto weight_mat = EigenMatrix<T>::From(*(context.Input<Tensor>("Weight")));
    for (int64_t i = 0; i < num_samples; ++i) {
      // Dot product of the sample's input row with the weight row of its
      // class id.
      Eigen::Tensor<T, 0, Eigen::RowMajor, Eigen::DenseIndex> result =
          (input_mat.chip(static_cast<int>(i / sample_width), 0) *
           weight_mat.chip(sample_labels_data[i], 0))
              .sum();
      sample_out_data[i] += result(0);
      // Despite the output's name, the stored value is the sigmoid of the
      // logit; the gradient kernel consumes it in this form.
      sample_out_data[i] = (1. / (1. + exp(-sample_out_data[i])));
    }

    // forward cost
    for (int64_t i = 0; i < batch_size; ++i) {
      int64_t j = 0;
      out_data[i] = 0;
      T w = sample_weight == nullptr ? 1. : sample_weight_data[i];
      // for true classes
      for (; j < num_true_class; ++j) {
        T o = sample_out_data[i * logits_width + j];
        T cost = -log(o / (o + b));
        out_data[i] += w * cost;
      }
      // for sampled neg classes
      for (; j < sample_width; ++j) {
        T o = sample_out_data[i * logits_width + j];
        T cost = -log(b / (o + b));
        out_data[i] += w * cost;
      }
    }
  }
};
// Backward kernel of the NCE operator.
//
// From Input(Cost@GRAD), the cached Input(SampleLogits) (sigmoid outputs of
// the forward pass) and Input(SampleLabels), Compute() first derives the
// gradient with respect to every sampled pre-sigmoid score and then scatters
// it into whichever of Bias@GRAD, Weight@GRAD and Input@GRAD are requested.
// Every requested gradient buffer is zero-initialised before accumulation,
// because the same class id (or input row) may be hit by several samples.
template <typename Place, typename T>
class NCEGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto d_out = context.Input<Tensor>(framework::GradVarName("Cost"));
    const T* d_out_data = d_out->data<T>();
    auto label = context.Input<Tensor>("Label");
    auto sample_out = context.Input<Tensor>("SampleLogits");
    const T* sample_out_data = sample_out->data<T>();
    auto sample_labels = context.Input<Tensor>("SampleLabels");
    const int64_t* sample_labels_data = sample_labels->data<int64_t>();
    auto sample_weight = context.Input<Tensor>("SampleWeight");
    const T* sample_weight_data = nullptr;
    if (sample_weight != nullptr) {
      sample_weight_data = sample_weight->data<T>();
    }
    int num_neg_samples = context.Attr<int>("num_neg_samples");
    int num_total_classes = context.Attr<int>("num_total_classes");

    // Number of true classes per row. Mirrors PrepareSamples(): a Label that
    // is not 2-D contributes exactly one true class per row, so dims()[1] is
    // only read when that dimension exists.
    int64_t num_true_class = 1;
    if (label != nullptr && label->dims().size() == 2) {
      num_true_class = label->dims()[1];
    }
    T b = 1. / num_total_classes * num_neg_samples;

    // Loop-invariant sizes, read once instead of on every iteration.
    const int64_t num_samples = sample_labels->numel();
    const int64_t sample_width = sample_labels->dims()[1];

    Tensor sample_grad;  // tmp tensor
    T* sample_grad_data =
        sample_grad.mutable_data<T>(sample_labels->dims(), context.GetPlace());

    // backward cost: gradient w.r.t. each sample's pre-sigmoid score.
    for (int64_t i = 0; i < num_samples; ++i) {
      const int64_t row = i / sample_width;
      T o = sample_out_data[i];
      T w = sample_weight == nullptr ? 1 : sample_weight_data[row];
      // The leading num_true_class columns of a row are true classes, the
      // rest are sampled negatives; each uses its own cost derivative.
      sample_grad_data[i] = (i % sample_width) < num_true_class
                                ? w * (b / (o + b)) * (o - 1)
                                : w * (o * (1 - o) / (o + b));
      sample_grad_data[i] *= d_out_data[row];
    }

    // get d_bias
    auto d_bias = context.Output<Tensor>(framework::GradVarName("Bias"));
    if (d_bias != nullptr) {
      T* d_bias_data = d_bias->mutable_data<T>(context.GetPlace());
      std::fill(d_bias_data, d_bias_data + d_bias->numel(), 0.0);
      for (int64_t i = 0; i < num_samples; ++i) {
        d_bias_data[sample_labels_data[i]] += sample_grad_data[i];
      }
    }

    // get d_w
    auto d_w = context.Output<Tensor>(framework::GradVarName("Weight"));
    if (d_w != nullptr) {
      auto d_w_data = d_w->mutable_data<T>(context.GetPlace());
      std::fill(d_w_data, d_w_data + d_w->numel(), 0.0);
      auto d_w_matrix = EigenMatrix<T>::From(*d_w);
      auto x_matrix = EigenMatrix<T>::From(*(context.Input<Tensor>("Input")));
      for (int64_t i = 0; i < num_samples; ++i) {
        d_w_matrix.chip(sample_labels_data[i], 0) +=
            x_matrix.chip(static_cast<int>(i / sample_width), 0) *
            sample_grad_data[i];
      }
    }

    // get d_x
    auto d_x = context.Output<Tensor>(framework::GradVarName("Input"));
    if (d_x != nullptr) {
      T* d_x_data = d_x->mutable_data<T>(context.GetPlace());
      // The buffer returned by mutable_data is not cleared; it must be zeroed
      // before the += accumulation below, exactly like d_bias and d_w.
      std::fill(d_x_data, d_x_data + d_x->numel(), 0.0);
      auto d_x_matrix = EigenMatrix<T>::From(*d_x);
      auto w_matrix = EigenMatrix<T>::From(*(context.Input<Tensor>("Weight")));
      for (int64_t i = 0; i < num_samples; ++i) {
        d_x_matrix.chip(static_cast<int>(i / sample_width), 0) +=
            w_matrix.chip(sample_labels_data[i], 0) * sample_grad_data[i];
      }
    }
  }
};
} // namespace operators
} // namespace paddle
...@@ -105,7 +105,7 @@ Pool2dOpMaker::Pool2dOpMaker(framework::OpProto *proto, ...@@ -105,7 +105,7 @@ Pool2dOpMaker::Pool2dOpMaker(framework::OpProto *proto,
// TypedAttrChecker don't support vector type.) // TypedAttrChecker don't support vector type.)
AddAttr<std::vector<int>>( AddAttr<std::vector<int>>(
"paddings", "paddings",
"(vector<int>, defalut {0,0}), paddings(height, width) of pooling " "(vector<int>, default {0,0}), paddings(height, width) of pooling "
"operator." "operator."
"If global_pooling = true, paddings and ksize will be ignored.") "If global_pooling = true, paddings and ksize will be ignored.")
.SetDefault({0, 0}); // TODO(Chengduo): Add checker. (Currently, .SetDefault({0, 0}); // TODO(Chengduo): Add checker. (Currently,
...@@ -127,10 +127,10 @@ Example: ...@@ -127,10 +127,10 @@ Example:
X shape: $(N, C, H_{in}, W_{in})$ X shape: $(N, C, H_{in}, W_{in})$
Output: Output:
Out shape: $(N, C, H_{out}, W_{out})$ Out shape: $(N, C, H_{out}, W_{out})$
where Where
$$ $$
H_{out} = (H_{in} - ksize[0] + 2 * paddings[0]) / strides[0] + 1 \\ H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\
W_{out} = (W_{in} - ksize[1] + 2 * paddings[1]) / strides[1] + 1 W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1
$$ $$
)DOC"); )DOC");
...@@ -177,7 +177,7 @@ Pool3dOpMaker::Pool3dOpMaker(framework::OpProto *proto, ...@@ -177,7 +177,7 @@ Pool3dOpMaker::Pool3dOpMaker(framework::OpProto *proto,
// TypedAttrChecker don't support vector type.) // TypedAttrChecker don't support vector type.)
AddAttr<std::vector<int>>( AddAttr<std::vector<int>>(
"paddings", "paddings",
"(vector<int>, defalut {0,0,0}), paddings(depth, height, " "(vector<int>, default {0,0,0}), paddings(depth, height, "
"width) of pooling operator. " "width) of pooling operator. "
"If global_pooling = true, ksize and paddings will be ignored.") "If global_pooling = true, ksize and paddings will be ignored.")
.SetDefault({0, 0, 0}); // TODO(Chengduo): Add checker. (Currently, .SetDefault({0, 0, 0}); // TODO(Chengduo): Add checker. (Currently,
...@@ -199,11 +199,11 @@ Example: ...@@ -199,11 +199,11 @@ Example:
X shape: $(N, C, D_{in}, H_{in}, W_{in})$ X shape: $(N, C, D_{in}, H_{in}, W_{in})$
Output: Output:
Out shape: $(N, C, D_{out}, H_{out}, W_{out})$ Out shape: $(N, C, D_{out}, H_{out}, W_{out})$
where Where
$$ $$
D_{out} = (D_{in} - ksize[0] + 2 * paddings[0]) / strides[0] + 1 \\ D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\
H_{out} = (H_{in} - ksize[1] + 2 * paddings[1]) / strides[1] + 1 \\ H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 \\
W_{out} = (W_{in} - ksize[2] + 2 * paddings[2]) / strides[2] + 1 W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1
$$ $$
)DOC"); )DOC");
......
...@@ -142,7 +142,7 @@ class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -142,7 +142,7 @@ class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
// TypedAttrChecker don't support vector type.) // TypedAttrChecker don't support vector type.)
AddAttr<std::vector<int>>( AddAttr<std::vector<int>>(
"paddings", "paddings",
"(vector<int>, defalut:{0, 0}), paddings(height, width) of pooling " "(vector<int>, default:{0, 0}), paddings(height, width) of pooling "
"operator. " "operator. "
"If global_pooling = true, paddings and will be ignored.") "If global_pooling = true, paddings and will be ignored.")
.SetDefault({0, 0}); // TODO(Chengduo): Add checker. (Currently, .SetDefault({0, 0}); // TODO(Chengduo): Add checker. (Currently,
...@@ -166,10 +166,10 @@ Example: ...@@ -166,10 +166,10 @@ Example:
Output: Output:
Out shape: $(N, C, H_{out}, W_{out})$ Out shape: $(N, C, H_{out}, W_{out})$
Mask shape: $(N, C, H_{out}, W_{out})$ Mask shape: $(N, C, H_{out}, W_{out})$
where Where
$$ $$
H_{out} = (H_{in} - ksize[0] + 2 * paddings[0]) / strides[0] + 1 \\ H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\
W_{out} = (W_{in} - ksize[1] + 2 * paddings[1]) / strides[1] + 1 W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1
$$ $$
)DOC"); )DOC");
...@@ -220,7 +220,7 @@ class MaxPool3dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -220,7 +220,7 @@ class MaxPool3dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
// TypedAttrChecker don't support vector type.) // TypedAttrChecker don't support vector type.)
AddAttr<std::vector<int>>( AddAttr<std::vector<int>>(
"paddings", "paddings",
"(vector, defalut {0,0,0}), paddings(depth, " "(vector, default {0,0,0}), paddings(depth, "
"height, width) of pooling operator. " "height, width) of pooling operator. "
"If global_pooling = true, paddings and ksize will be ignored.") "If global_pooling = true, paddings and ksize will be ignored.")
.SetDefault({0, 0, 0}); // TODO(Chengduo): Add checker. (Currently, .SetDefault({0, 0, 0}); // TODO(Chengduo): Add checker. (Currently,
...@@ -244,11 +244,11 @@ Example: ...@@ -244,11 +244,11 @@ Example:
Output: Output:
Out shape: $(N, C, D_{out}, H_{out}, W_{out})$ Out shape: $(N, C, D_{out}, H_{out}, W_{out})$
Mask shape: $(N, C, D_{out}, H_{out}, W_{out})$ Mask shape: $(N, C, D_{out}, H_{out}, W_{out})$
where Where
$$ $$
D_{out} = (D_{in} - ksize[0] + 2 * paddings[0]) / strides[0] + 1 \\ D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\
H_{out} = (H_{in} - ksize[1] + 2 * paddings[1]) / strides[1] + 1 \\ H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 \\
W_{out} = (W_{in} - ksize[2] + 2 * paddings[2]) / strides[2] + 1 W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1
$$ $$
)DOC"); )DOC");
......
...@@ -35,9 +35,10 @@ class RankLossOp : public framework::OperatorWithKernel { ...@@ -35,9 +35,10 @@ class RankLossOp : public framework::OperatorWithKernel {
auto right_dims = ctx->GetInputDim("Right"); auto right_dims = ctx->GetInputDim("Right");
PADDLE_ENFORCE((label_dims == left_dims) && (left_dims == right_dims), PADDLE_ENFORCE((label_dims == left_dims) && (left_dims == right_dims),
"All inputs must have the same size"); "All inputs must have the same size.");
PADDLE_ENFORCE((label_dims.size() == 2) && (label_dims[1] == 1), PADDLE_ENFORCE(
"All inputs must be row vector with size batch_size x 1."); (label_dims.size() == 2) && (label_dims[1] == 1),
"All inputs must be 2-D tensors with shape [batch_size x 1].");
ctx->SetOutputDim("Out", label_dims); ctx->SetOutputDim("Out", label_dims);
} }
}; };
...@@ -48,10 +49,17 @@ class RankLossOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -48,10 +49,17 @@ class RankLossOpMaker : public framework::OpProtoAndCheckerMaker {
framework::OpAttrChecker *op_checker) framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) { : OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("Label", AddInput("Label",
"The label indicating A ranked higher than B or not, row vector."); "(2-D Tensor with shape [batch_size x 1]) "
AddInput("Left", "The output of RankNet for doc A, vector."); "The label indicating A ranked higher than B or not.");
AddInput("Right", "The output of RankNet for doc B, vetor."); AddInput("Left",
AddOutput("Out", "The output loss of RankLoss operator, vector."); "(2-D Tensor with shape [batch_size x 1]) "
"The output of RankNet for doc A.");
AddInput("Right",
"(2-D Tensor with shape [batch_size x 1]) "
"The output of RankNet for doc B.");
AddOutput("Out",
"(2-D Tensor with shape [batch_size x 1]) "
"The output loss of RankLoss operator.");
AddComment(R"DOC( AddComment(R"DOC(
RankLoss Operator. RankLoss Operator.
...@@ -65,16 +73,17 @@ P = {0, 1} or {0, 0.5, 1}, where 0.5 means no information about the rank of ...@@ -65,16 +73,17 @@ P = {0, 1} or {0, 0.5, 1}, where 0.5 means no information about the rank of
the input pair. the input pair.
The RankLoss operator takes three inputs: Left (o_i), Right (o_j) and Label The RankLoss operator takes three inputs: Left (o_i), Right (o_j) and Label
(P_{i,j}), which represent the output of RankNet for the two docs and the label, (P_{i,j}), which represent the output score of RankNet for the two docs and
respectively, and yields the rank loss C_{i,j} using the following equation: the label respectively, and yields the rank loss C_{i,j} using the following
equation:
\f$$ $$
C_{i,j} = -\tilde{P_{ij}} * o_{i,j} + log(1 + e^{o_{i,j}}) \\ C_{i,j} = -\tilde{P_{ij}} * o_{i,j} + \log(1 + e^{o_{i,j}}) \\
o_{i,j} = o_i - o_j \\ o_{i,j} = o_i - o_j \\
\tilde{P_{i,j}} = \left \{0, 0.5, 1 \right \} \ or \ \left \{0, 1 \right \} \tilde{P_{i,j}} = \left \{0, 0.5, 1 \right \} \ or \ \left \{0, 1 \right \}
\f$$ $$
The operator can take inputs of one sample or in batch. The operator can take batch inputs with size batch_size (batch_size >= 1).
)DOC"); )DOC");
} }
......
...@@ -599,7 +599,9 @@ class RecurrentGradOpShapeInference : public framework::InferShapeBase { ...@@ -599,7 +599,9 @@ class RecurrentGradOpShapeInference : public framework::InferShapeBase {
std::vector<std::string> output{kOutputs}; std::vector<std::string> output{kOutputs};
for (auto &s : input) { for (auto &s : input) {
PADDLE_ENFORCE(ctx->HasInputs(s)); PADDLE_ENFORCE(ctx->HasInputs(s));
PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(s))); PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(s)),
"Cannot find the gradient variable %s",
framework::GradVarName(s));
} }
for (auto &s : output) { for (auto &s : output) {
PADDLE_ENFORCE(ctx->HasInputs(s)); PADDLE_ENFORCE(ctx->HasInputs(s));
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
...@@ -38,8 +37,8 @@ class ReshapeOp : public framework::OperatorWithKernel { ...@@ -38,8 +37,8 @@ class ReshapeOp : public framework::OperatorWithKernel {
// TODO(qiao) change batch_size // TODO(qiao) change batch_size
for (size_t i = 1; i < shape.size(); ++i) { for (size_t i = 1; i < shape.size(); ++i) {
PADDLE_ENFORCE(shape[i] > 0, PADDLE_ENFORCE(shape[i] > 0,
"Each dimension of shape " "Each dimension of Attr(shape) "
"must be positiv except the first."); "must be positive except the first one.");
} }
if (shape[0] < 0) { if (shape[0] < 0) {
shape[0] = x_dims[0]; shape[0] = x_dims[0];
......
...@@ -77,4 +77,6 @@ REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker<float>, ...@@ -77,4 +77,6 @@ REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker<float>,
ops::ScaleGradMaker); ops::ScaleGradMaker);
REGISTER_OP_CPU_KERNEL(scale, REGISTER_OP_CPU_KERNEL(scale,
ops::ScaleKernel<paddle::platform::CPUPlace, float>, ops::ScaleKernel<paddle::platform::CPUPlace, float>,
ops::ScaleKernel<paddle::platform::CPUPlace, double>); ops::ScaleKernel<paddle::platform::CPUPlace, double>,
ops::ScaleKernel<paddle::platform::CPUPlace, int>,
ops::ScaleKernel<paddle::platform::CPUPlace, int64_t>);
...@@ -16,4 +16,6 @@ ...@@ -16,4 +16,6 @@
REGISTER_OP_GPU_KERNEL( REGISTER_OP_GPU_KERNEL(
scale, paddle::operators::ScaleKernel<paddle::platform::GPUPlace, float>, scale, paddle::operators::ScaleKernel<paddle::platform::GPUPlace, float>,
paddle::operators::ScaleKernel<paddle::platform::GPUPlace, double>); paddle::operators::ScaleKernel<paddle::platform::GPUPlace, double>,
paddle::operators::ScaleKernel<paddle::platform::GPUPlace, int>,
paddle::operators::ScaleKernel<paddle::platform::GPUPlace, int64_t>);
...@@ -104,6 +104,7 @@ class SequencePoolGradOp : public framework::OperatorWithKernel { ...@@ -104,6 +104,7 @@ class SequencePoolGradOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE_EQ(og_dims[i], x_dims[i], "The dimension mismatch."); PADDLE_ENFORCE_EQ(og_dims[i], x_dims[i], "The dimension mismatch.");
} }
ctx->SetOutputDim(framework::GradVarName("X"), x_dims); ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
ctx->ShareLoD("X", framework::GradVarName("X"));
} }
protected: protected:
......
...@@ -54,10 +54,10 @@ class SequenceSliceOpKernel : public framework::OpKernel<T> { ...@@ -54,10 +54,10 @@ class SequenceSliceOpKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now."); PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now.");
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
n, static_cast<size_t>(length->dims()[0]), n, static_cast<size_t>(length->dims()[0]),
"The size of input-sequence and length-array should be the same") "The size of input-sequence and length-array should be the same");
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
n, static_cast<size_t>(offset->dims()[0]), n, static_cast<size_t>(offset->dims()[0]),
"The size of input-sequence and offset-array should be the same") "The size of input-sequence and offset-array should be the same");
const int64_t* offset_data = offset->data<int64_t>(); const int64_t* offset_data = offset->data<int64_t>();
const int64_t* length_data = length->data<int64_t>(); const int64_t* length_data = length->data<int64_t>();
...@@ -78,11 +78,11 @@ class SequenceSliceOpKernel : public framework::OpKernel<T> { ...@@ -78,11 +78,11 @@ class SequenceSliceOpKernel : public framework::OpKernel<T> {
for (size_t i = 0; i < n; ++i) { for (size_t i = 0; i < n; ++i) {
PADDLE_ENFORCE_LT(0, offset_data[i], PADDLE_ENFORCE_LT(0, offset_data[i],
"The offset[%d] must greater than zero.", i) "The offset[%d] must greater than zero.", i);
PADDLE_ENFORCE_LT(0, length_data[i], PADDLE_ENFORCE_LT(0, length_data[i],
"The length[%d] must greater than zero.", i) "The length[%d] must greater than zero.", i);
PADDLE_ENFORCE_LT(lod[0][i] + offset_data[i] + length_data[i], PADDLE_ENFORCE_LT(lod[0][i] + offset_data[i] + length_data[i],
lod[0][i + 1], "The target tensor's length overflow.") lod[0][i + 1], "The target tensor's length overflow.");
} }
out->mutable_data<T>(ctx.GetPlace()); out->mutable_data<T>(ctx.GetPlace());
......
...@@ -25,20 +25,19 @@ class SigmoidCrossEntropyWithLogitsOp : public framework::OperatorWithKernel { ...@@ -25,20 +25,19 @@ class SigmoidCrossEntropyWithLogitsOp : public framework::OperatorWithKernel {
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
PADDLE_ENFORCE(ctx->HasInput("Labels"), PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
"Input(Labels) should be not null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should be not null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should be not null.");
auto x_dims = ctx->GetInputDim("X"); auto x_dims = ctx->GetInputDim("X");
auto labels_dims = ctx->GetInputDim("Labels"); auto labels_dims = ctx->GetInputDim("Label");
PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2."); PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2.");
PADDLE_ENFORCE_EQ(labels_dims.size(), 2, PADDLE_ENFORCE_EQ(labels_dims.size(), 2,
"Input(Labels)'s rank should be 2."); "Input(Label)'s rank should be 2.");
PADDLE_ENFORCE_EQ(x_dims[0], labels_dims[0], PADDLE_ENFORCE_EQ(x_dims[0], labels_dims[0],
"The 1st dimension of Input(X) and Input(Labels) should " "The 1st dimension of Input(X) and Input(Label) should "
"be equal."); "be equal.");
PADDLE_ENFORCE_EQ(x_dims[1], labels_dims[1], PADDLE_ENFORCE_EQ(x_dims[1], labels_dims[1],
"The 2nd dimension of Input(X) and Input(Labels) should " "The 2nd dimension of Input(X) and Input(Label) should "
"be equal."); "be equal.");
ctx->SetOutputDim("Out", x_dims); ctx->SetOutputDim("Out", x_dims);
...@@ -53,26 +52,25 @@ class SigmoidCrossEntropyWithLogitsGradOp ...@@ -53,26 +52,25 @@ class SigmoidCrossEntropyWithLogitsGradOp
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
PADDLE_ENFORCE(ctx->HasInput("Labels"), PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
"Input(Labels) should be not null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) shoudl be not null."); "Input(Out@GRAD) shoudl be not null.");
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
"Output(X@GRAD) should be not null."); "Output(X@GRAD) should be not null.");
auto x_dims = ctx->GetInputDim("X"); auto x_dims = ctx->GetInputDim("X");
auto labels_dims = ctx->GetInputDim("Labels"); auto labels_dims = ctx->GetInputDim("Label");
auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out")); auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out"));
PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2."); PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2.");
PADDLE_ENFORCE_EQ(labels_dims.size(), 2, PADDLE_ENFORCE_EQ(labels_dims.size(), 2,
"Input(Labels)'s rank should be 2."); "Input(Label)'s rank should be 2.");
PADDLE_ENFORCE_EQ(dout_dims.size(), 2, PADDLE_ENFORCE_EQ(dout_dims.size(), 2,
"Input(Out@Grad)'s rank should be 2."); "Input(Out@Grad)'s rank should be 2.");
PADDLE_ENFORCE_EQ(x_dims[0], labels_dims[0], PADDLE_ENFORCE_EQ(x_dims[0], labels_dims[0],
"The 1st dimension of Input(X) and Input(Labels) should " "The 1st dimension of Input(X) and Input(Label) should "
"be equal."); "be equal.");
PADDLE_ENFORCE_EQ(x_dims[1], labels_dims[1], PADDLE_ENFORCE_EQ(x_dims[1], labels_dims[1],
"The 2nd dimension of Input(X) and Input(Labels) should " "The 2nd dimension of Input(X) and Input(Label) should "
"be equal."); "be equal.");
PADDLE_ENFORCE_EQ(x_dims[0], dout_dims[0], PADDLE_ENFORCE_EQ(x_dims[0], dout_dims[0],
"The 1st dimension of Input(X) and Input(Out@Grad) " "The 1st dimension of Input(X) and Input(Out@Grad) "
...@@ -97,7 +95,7 @@ class SigmoidCrossEntropyWithLogitsOpMaker ...@@ -97,7 +95,7 @@ class SigmoidCrossEntropyWithLogitsOpMaker
"This input is a tensor of logits computed by the previous " "This input is a tensor of logits computed by the previous "
" operator. Logits are unscaled log probabilities given as " " operator. Logits are unscaled log probabilities given as "
"log(p/(1-p))."); "log(p/(1-p)).");
AddInput("Labels", AddInput("Label",
"(Tensor, default Tensor<float>), a 2-D tensor of the same type " "(Tensor, default Tensor<float>), a 2-D tensor of the same type "
"and shape as X. This input is a tensor of probabalistic labels " "and shape as X. This input is a tensor of probabalistic labels "
"for each logit"); "for each logit");
......
...@@ -25,8 +25,7 @@ class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel<T> { ...@@ -25,8 +25,7 @@ class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext &context) const override { void Compute(const framework::ExecutionContext &context) const override {
const framework::Tensor *X = context.Input<framework::Tensor>("X"); const framework::Tensor *X = context.Input<framework::Tensor>("X");
const framework::Tensor *Labels = const framework::Tensor *Labels = context.Input<framework::Tensor>("Label");
context.Input<framework::Tensor>("Labels");
framework::Tensor *Out = context.Output<framework::Tensor>("Out"); framework::Tensor *Out = context.Output<framework::Tensor>("Out");
Out->mutable_data<T>(context.GetPlace()); Out->mutable_data<T>(context.GetPlace());
...@@ -52,8 +51,7 @@ class SigmoidCrossEntropyWithLogitsGradKernel : public framework::OpKernel<T> { ...@@ -52,8 +51,7 @@ class SigmoidCrossEntropyWithLogitsGradKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext &context) const override { void Compute(const framework::ExecutionContext &context) const override {
const framework::Tensor *X = context.Input<framework::Tensor>("X"); const framework::Tensor *X = context.Input<framework::Tensor>("X");
const framework::Tensor *Labels = const framework::Tensor *Labels = context.Input<framework::Tensor>("Label");
context.Input<framework::Tensor>("Labels");
const framework::Tensor *dOut = const framework::Tensor *dOut =
context.Input<framework::Tensor>(framework::GradVarName("Out")); context.Input<framework::Tensor>(framework::GradVarName("Out"));
framework::Tensor *dX = framework::Tensor *dX =
......
...@@ -22,22 +22,20 @@ class SmoothL1LossOp : public framework::OperatorWithKernel { ...@@ -22,22 +22,20 @@ class SmoothL1LossOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "X must be initialized."); PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Y"), "Y must be initialized."); PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null.");
auto x_dims = ctx->GetInputDim("X"); auto x_dims = ctx->GetInputDim("X");
auto y_dims = ctx->GetInputDim("Y"); auto y_dims = ctx->GetInputDim("Y");
PADDLE_ENFORCE_EQ(x_dims, y_dims, "The shape of X and Y must be the same."); PADDLE_ENFORCE_EQ(x_dims, y_dims);
PADDLE_ENFORCE_GE(x_dims.size(), 2, PADDLE_ENFORCE_GE(x_dims.size(), 2,
"The tensor rank of X must be at least 2."); "The tensor rank of Input(X) should not be less than 2.");
if (ctx->HasInput("InsideWeight")) { if (ctx->HasInput("InsideWeight")) {
PADDLE_ENFORCE(ctx->HasInput("OutsideWeight"), PADDLE_ENFORCE(ctx->HasInput("OutsideWeight"),
"If weights are provided, must specify both " "If weights are provided, must specify both "
"inside and outside weights."); "inside and outside weights.");
PADDLE_ENFORCE_EQ(ctx->GetInputDim("InsideWeight"), x_dims, PADDLE_ENFORCE_EQ(ctx->GetInputDim("InsideWeight"), x_dims);
"The shape of InsideWeight must be same as X."); PADDLE_ENFORCE_EQ(ctx->GetInputDim("OutsideWeight"), x_dims);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("OutsideWeight"), x_dims,
"The shape of OutsideWeight must be same as X.");
} }
ctx->SetOutputDim("Diff", x_dims); ctx->SetOutputDim("Diff", x_dims);
...@@ -53,25 +51,29 @@ class SmoothL1LossOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -53,25 +51,29 @@ class SmoothL1LossOpMaker : public framework::OpProtoAndCheckerMaker {
framework::OpAttrChecker* op_checker) framework::OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) { : OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", AddInput("X",
"The input tensor of smooth l1 loss op." "(Tensor, default Tensor<float>) A tensor with rank at least 2. "
"The rank should be greater or equal to 2 with shape " "The input value of smooth l1 loss op with shape "
"[batch_size, value_dim1, value_dim2, ..., value_dimN]"); "[batch_size, dim1, ..., dimN].");
AddInput("Y", AddInput("Y",
"The target tensor of smooth l1 loss op " "(Tensor, default Tensor<float>) A tensor with rank at least 2. "
"with the same shape as X."); "The target value of smooth l1 loss op with same shape as X.");
AddInput("InsideWeight", AddInput("InsideWeight",
"Optional input tensor of smooth l1 loss op with the same shape " "(Tensor, default Tensor<float>) A tensor with rank at least 2. "
"as X. If provided, the result of (X - Y) will be multiplied " "This input is optional and should have same shape with X. "
"If provided, the result of (X - Y) will be multiplied "
"by this tensor element by element.") "by this tensor element by element.")
.AsDispensable(); .AsDispensable();
AddInput("OutsideWeight", AddInput("OutsideWeight",
"Optinal input of smooth l1 loss op with the same shape as X." "(Tensor, default Tensor<float>) A tensor with rank at least 2. "
"If provided, the output smooth l1 loss will be multiplied by " "This input is optional and should have same shape with X. "
"this tensor element by element.") "If provided, the out smooth l1 loss will be multiplied by this "
"tensor element by element.")
.AsDispensable(); .AsDispensable();
AddOutput("Diff", "Intermediate variable to cache InsideWeight*(X-Y).") AddOutput("Diff", "Intermediate variable to cache InsideWeight * (X - Y).")
.AsIntermediate(); .AsIntermediate();
AddOutput("Out", "Smooth l1 loss."); AddOutput("Out",
"(Tensor, default Tensor<float>) A tensor with rank be 2. "
"The output smooth l1 loss with shape [batch_size, 1].");
AddAttr<AttrType>("sigma", AddAttr<AttrType>("sigma",
"Hyper parameter of smooth l1 loss op." "Hyper parameter of smooth l1 loss op."
"A float scalar with default value 3.0.") "A float scalar with default value 3.0.")
...@@ -79,15 +81,23 @@ class SmoothL1LossOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -79,15 +81,23 @@ class SmoothL1LossOpMaker : public framework::OpProtoAndCheckerMaker {
AddComment(R"DOC( AddComment(R"DOC(
Smooth L1 Loss Operator. Smooth L1 Loss Operator.
This operator computes the smooth l1 loss for input and target. This operator computes the smooth l1 loss for X and Y.
The operator takes the first dimension of input as the batch size. The operator takes the first dimension of X and Y as batch size.
For each instance, it computes the smooth l1 loss element by element first For each instance, it computes the smooth l1 loss element by element first
and then sums all the losses. So the resulting output shape and then sums all the losses. So the shape of Out is [batch_size, 1].
is [batch_size, 1].
The equation is: The equation is:
loss = $$0.5 * (\sigma * (x-y))^2$$ if $$|x - y| < 1 /({\sigma}^2)$$ $$
$$\frac{|x - y| - 0.5}{{\sigma}^2}$$ otherwise Out_{\sigma}(X, Y)_i = \begin{cases}
0.5 * (\sigma * (X_i - Y_i)) ^ 2
\quad |X_i - Y_i| \lt \frac{1} {{\sigma} ^ 2} \\
\frac{|X_i - Y_i| - 0.5}{{\sigma}^2},
\quad otherwise
\end{cases}
$$
In the above equation, $Out_{\sigma}(X, Y)_i$, $X_i$ and $Y_i$ represent the ith
element of Out, X and Y.
)DOC"); )DOC");
} }
......
...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/operators/softmax_with_cross_entropy_op.h" #include "paddle/operators/softmax_with_cross_entropy_op.h"
#include <paddle/function/TensorType.h>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -37,10 +37,16 @@ class SumOp : public framework::OperatorWithKernel { ...@@ -37,10 +37,16 @@ class SumOp : public framework::OperatorWithKernel {
size_t N = x_dims.size(); size_t N = x_dims.size();
PADDLE_ENFORCE_GT(N, 1, "Input tensors count should > 1."); PADDLE_ENFORCE_GT(N, 1, "Input tensors count should > 1.");
auto in_dim = x_dims[0]; framework::DDim in_dim({0});
for (size_t i = 1; i < N; i++) { for (auto& x_dim : x_dims) {
auto dim = x_dims[i]; if (framework::product(x_dim) == 0) {
PADDLE_ENFORCE_EQ(in_dim, dim, "Input tensors must have same shape"); continue;
}
if (framework::product(in_dim) == 0) {
in_dim = x_dim;
} else {
PADDLE_ENFORCE_EQ(in_dim, x_dim, "Input tensors must have same shape");
}
} }
ctx->SetOutputDim("Out", in_dim); ctx->SetOutputDim("Out", in_dim);
ctx->ShareLoD("X", /*->*/ "Out"); ctx->ShareLoD("X", /*->*/ "Out");
...@@ -51,8 +57,22 @@ class SumOp : public framework::OperatorWithKernel { ...@@ -51,8 +57,22 @@ class SumOp : public framework::OperatorWithKernel {
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
auto x_vars = ctx.MultiInputVar("X"); auto x_vars = ctx.MultiInputVar("X");
if (x_vars[0]->IsType<framework::LoDTensor>()) { if (x_vars[0]->IsType<framework::LoDTensor>()) {
return framework::OpKernelType( int dtype = -1;
framework::ToDataType(x_vars[0]->Get<framework::LoDTensor>().type()), for (auto& x_var : x_vars) {
auto& lod_tensor = x_var->Get<framework::LoDTensor>();
if (lod_tensor.numel() == 0) {
continue;
}
if (dtype == -1) {
dtype = framework::ToDataType(lod_tensor.type());
} else {
PADDLE_ENFORCE_EQ(dtype, framework::ToDataType(lod_tensor.type()));
}
}
PADDLE_ENFORCE_NE(dtype, -1,
"Sum operator should have at least one tensor");
return framework::OpKernelType(static_cast<framework::DataType>(dtype),
ctx.device_context()); ctx.device_context());
} else if (x_vars[0]->IsType<framework::SelectedRows>()) { } else if (x_vars[0]->IsType<framework::SelectedRows>()) {
return framework::OpKernelType( return framework::OpKernelType(
......
...@@ -53,6 +53,9 @@ class SumKernel : public framework::OpKernel<T> { ...@@ -53,6 +53,9 @@ class SumKernel : public framework::OpKernel<T> {
for (int i = in_place ? 1 : 0; i < N; i++) { for (int i = in_place ? 1 : 0; i < N; i++) {
if (in_vars[i]->IsType<framework::LoDTensor>()) { if (in_vars[i]->IsType<framework::LoDTensor>()) {
auto &in_t = in_vars[i]->Get<framework::LoDTensor>(); auto &in_t = in_vars[i]->Get<framework::LoDTensor>();
if (in_t.numel() == 0) {
continue;
}
auto in = EigenVector<T>::Flatten(in_t); auto in = EigenVector<T>::Flatten(in_t);
result.device(place) = result + in; result.device(place) = result + in;
} else if (in_vars[i]->IsType<framework::SelectedRows>()) { } else if (in_vars[i]->IsType<framework::SelectedRows>()) {
...@@ -84,7 +87,7 @@ class SumKernel : public framework::OpKernel<T> { ...@@ -84,7 +87,7 @@ class SumKernel : public framework::OpKernel<T> {
int64_t offset = 0; int64_t offset = 0;
for (int i = 0; i < N; i++) { for (int i = 0; i < N; i++) {
PADDLE_ENFORCE_EQ(out->height(), PADDLE_ENFORCE_EQ(out->height(),
in_vars[i]->Get<SelectedRows>().height()) in_vars[i]->Get<SelectedRows>().height());
functor(context.device_context(), in_vars[i]->Get<SelectedRows>(), functor(context.device_context(), in_vars[i]->Get<SelectedRows>(),
offset, out); offset, out);
offset += in_vars[i]->Get<SelectedRows>().value().numel(); offset += in_vars[i]->Get<SelectedRows>().value().numel();
......
...@@ -27,7 +27,7 @@ class WriteToArrayOp : public ArrayOp { ...@@ -27,7 +27,7 @@ class WriteToArrayOp : public ArrayOp {
void Run(const framework::Scope &scope, void Run(const framework::Scope &scope,
const platform::DeviceContext &dev_ctx) const override { const platform::DeviceContext &dev_ctx) const override {
auto *x = scope.FindVar(Input("X")); auto *x = scope.FindVar(Input("X"));
PADDLE_ENFORCE(x != nullptr, "X must be set"); if (x == nullptr) return;
auto &x_tensor = x->Get<framework::LoDTensor>(); auto &x_tensor = x->Get<framework::LoDTensor>();
size_t offset = GetOffset(scope, dev_ctx); size_t offset = GetOffset(scope, dev_ctx);
auto *out = auto *out =
...@@ -37,9 +37,15 @@ class WriteToArrayOp : public ArrayOp { ...@@ -37,9 +37,15 @@ class WriteToArrayOp : public ArrayOp {
<< " to " << offset + 1; << " to " << offset + 1;
out->resize(offset + 1); out->resize(offset + 1);
} }
if (x_tensor.memory_size() > 0) {
auto *out_tensor = &out->at(offset); auto *out_tensor = &out->at(offset);
CopyFrom(x_tensor, dev_ctx.GetPlace(), dev_ctx, out_tensor); CopyFrom(x_tensor, dev_ctx.GetPlace(), dev_ctx, out_tensor);
out_tensor->set_lod(x_tensor.lod()); out_tensor->set_lod(x_tensor.lod());
} else {
VLOG(10) << "WARNING: The input tensor 'x_tensor' holds no memory, so "
"nothing has been written to output array["
<< offset << "].";
}
} }
}; };
...@@ -70,7 +76,9 @@ class WriteToArrayInferShape : public framework::InferShapeBase { ...@@ -70,7 +76,9 @@ class WriteToArrayInferShape : public framework::InferShapeBase {
PADDLE_ENFORCE(context->HasInput("I"), "Must set the subscript index"); PADDLE_ENFORCE(context->HasInput("I"), "Must set the subscript index");
PADDLE_ENFORCE_EQ(framework::product(context->GetInputDim("I")), 1, PADDLE_ENFORCE_EQ(framework::product(context->GetInputDim("I")), 1,
"The number of element of subscript index must be 1"); "The number of element of subscript index must be 1");
PADDLE_ENFORCE(context->HasInput("X"), NotHasXError()); if (!context->HasInput("X")) {
return;
}
PADDLE_ENFORCE(context->HasOutput("Out"), NotHasOutError()); PADDLE_ENFORCE(context->HasOutput("Out"), NotHasOutError());
context->SetOutputDim("Out", context->GetInputDim("X")); context->SetOutputDim("Out", context->GetInputDim("X"));
} }
...@@ -93,9 +101,10 @@ class WriteToArrayInferVarType : public framework::VarTypeInference { ...@@ -93,9 +101,10 @@ class WriteToArrayInferVarType : public framework::VarTypeInference {
auto &out = detail::Ref(block->FindRecursiveOrCreateVar(out_name), auto &out = detail::Ref(block->FindRecursiveOrCreateVar(out_name),
"Cannot found %s", out_name); "Cannot found %s", out_name);
out.SetType(framework::VarDesc::LOD_TENSOR_ARRAY); out.SetType(framework::VarDesc::LOD_TENSOR_ARRAY);
auto &x = auto *x = block->FindVarRecursive(x_name);
detail::Ref(block->FindVarRecursive(x_name), "Cannot found %s", x_name); if (x != nullptr) {
out.SetDataType(x.GetDataType()); out.SetDataType(x->GetDataType());
}
} }
}; };
...@@ -115,10 +124,13 @@ class ReadFromArrayOp : public ArrayOp { ...@@ -115,10 +124,13 @@ class ReadFromArrayOp : public ArrayOp {
PADDLE_ENFORCE(out != nullptr, "Out must be set"); PADDLE_ENFORCE(out != nullptr, "Out must be set");
auto *out_tensor = out->GetMutable<framework::LoDTensor>(); auto *out_tensor = out->GetMutable<framework::LoDTensor>();
size_t offset = GetOffset(scope, dev_ctx); size_t offset = GetOffset(scope, dev_ctx);
PADDLE_ENFORCE_LT(offset, x_array.size()); if (offset < x_array.size()) {
framework::CopyFrom(x_array[offset], dev_ctx.GetPlace(), dev_ctx, framework::CopyFrom(x_array[offset], dev_ctx.GetPlace(), dev_ctx,
out_tensor); out_tensor);
out_tensor->set_lod(x_array[offset].lod()); out_tensor->set_lod(x_array[offset].lod());
} else {
VLOG(10) << "offset " << offset << " >= " << x_array.size();
}
} }
}; };
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/unpool_op.h"
namespace paddle {
namespace operators {
// Declares the proto of the 2-D unpooling operator: its two inputs (X and
// Indices), its single output (Out), the ksize/strides/paddings/unpooling_type
// attributes, and the user-facing operator documentation.
class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  Unpool2dOpMaker(framework::OpProto* proto,
                  framework::OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput(
        "X",
        "(Tensor) The input tensor of unpool operator. "
        "The format of input tensor is NCHW. Where N is batch size, C is the "
        "number of channels, H and W is the height and width of feature.");
    AddInput(
        "Indices",
        "(Tensor) The input tensor of the indices given out by MaxPool2d. "
        "The format of input tensor is NCHW. Where N is batch size, C is the "
        "number of channels, H and W is the height and width of feature.");
    // Adjacent string literals are concatenated verbatim, so every piece has
    // to carry its own separating space.
    AddOutput("Out",
              "(Tensor) The output tensor of unpool operator. "
              "The format of output tensor is also NCHW. "
              "Where N is batch size, C is "
              "the number of channels, H and W is the height and "
              "width of feature.");
    AddAttr<std::vector<int>>(
        "ksize",
        "(vector), the unpooling window size(height, width) "
        "of unpooling operator.");
    AddAttr<std::vector<int>>("strides",
                              "(vector, default:{1, 1}), "
                              "strides (height, width) of unpooling operator.")
        .SetDefault({1, 1});
    AddAttr<std::vector<int>>("paddings",
                              "(vector, default:{0, 0}), "
                              "paddings (height, width) of unpooling operator.")
        .SetDefault({0, 0});
    // Only max-unpooling is accepted; InEnum rejects any other value.
    AddAttr<std::string>(
        "unpooling_type",
        "(string), unpooling type, can be \"max\" for max-unpooling ")
        .InEnum({"max"});
    AddComment(R"DOC(
Input shape: $(N, C_{in}, H_{in}, W_{in})$
Output shape: $(N, C_{out}, H_{out}, W_{out})$
Where
$$
H_{out} = (H_{in} - 1) * strides[0] - 2 * paddings[0] + ksize[0] \\
W_{out} = (W_{in} - 1) * strides[1] - 2 * paddings[1] + ksize[1]
$$
Paper: http://www.matthewzeiler.com/wp-content/uploads/2017/07/iccv2011.pdf
)DOC");
  }
};
// Returns the unpooled extent of one spatial dimension:
//   (input_size - 1) * stride - 2 * padding + ksize
// This is the same formula the operator documentation gives for H_out/W_out.
int OutputSize(int input_size, int ksize, int padding, int stride) {
  const int strided_span = (input_size - 1) * stride;
  return strided_span - 2 * padding + ksize;
}
// Forward unpooling operator. Picks the kernel from the element type of X and
// derives the shape of Out from X, ksize, strides and paddings.
class UnpoolOp : public framework::OperatorWithKernel {
 protected:
  // The kernel is selected by the data type held in X and the current device.
  framework::OpKernelType GetKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
        ctx.device_context());
  }

 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  // Validates the inputs and sets Out to
  //   [X[0], X[1], OutputSize(X[2], ...), OutputSize(X[3], ...)].
  // Fails an enforce when X or Indices or Out is missing, X is not 4-D,
  // X and Indices differ in shape, or the attribute vectors do not describe
  // exactly the two spatial dimensions.
  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of UnpoolOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Indices"),
                   "Input(Indices) of UnpoolOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "Output(Out) of UnpoolOp should not be null.");

    auto in_x_dims = ctx->GetInputDim("X");
    auto in_y_dims = ctx->GetInputDim("Indices");
    std::vector<int> ksize = ctx->Attrs().Get<std::vector<int>>("ksize");
    std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
    std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");

    PADDLE_ENFORCE(in_x_dims.size() == 4,
                   "Unpooling input must be 4-dimensional.");
    PADDLE_ENFORCE_EQ(in_x_dims, in_y_dims,
                      "Input(X) and Input(Indices) must have the same shape.");
    // The loop below indexes strides[i], paddings[i] and in_x_dims[i + 2] for
    // every entry of ksize, so all three vectors must cover exactly the two
    // spatial dimensions of the 4-D input.
    PADDLE_ENFORCE(ksize.size() == 2,
                   "Attr(ksize) of UnpoolOp must have 2 elements.");
    PADDLE_ENFORCE(strides.size() == ksize.size(),
                   "Attr(strides) of UnpoolOp must have 2 elements.");
    PADDLE_ENFORCE(paddings.size() == ksize.size(),
                   "Attr(paddings) of UnpoolOp must have 2 elements.");

    // Batch and channel extents are carried over unchanged.
    std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1]});
    for (size_t i = 0; i < ksize.size(); ++i) {
      output_shape.push_back(
          OutputSize(in_x_dims[i + 2], ksize[i], paddings[i], strides[i]));
    }
    ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
  }
};
// Gradient operator of unpool. Picks the kernel from the element type of X
// and gives X@GRAD the same shape as X.
class UnpoolOpGrad : public framework::OperatorWithKernel {
 protected:
  // The kernel is selected by the data type held in X and the current device.
  framework::OpKernelType GetKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
        ctx.device_context());
  }

 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  // Requires input X and output X@GRAD, then copies the dims of X to X@GRAD.
  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
    // X@GRAD is an output of this operator, so report it as one.
    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
                   "Output(X@GRAD) should not be null.");
    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
  }
};
} // namespace operators
} // namespace paddle
// Short alias used by the registration macros below.
namespace ops = paddle::operators;
// Registers operator type `unpool` (UnpoolOp, described by Unpool2dOpMaker)
// together with its gradient operator type `unpool_grad` (UnpoolOpGrad).
REGISTER_OP(unpool, ops::UnpoolOp, ops::Unpool2dOpMaker, unpool_grad,
ops::UnpoolOpGrad);
// CPU forward kernels, instantiated for float and double.
REGISTER_OP_CPU_KERNEL(unpool,
ops::UnpoolKernel<paddle::platform::CPUPlace, float>,
ops::UnpoolKernel<paddle::platform::CPUPlace, double>);
// CPU backward kernels, instantiated for float and double.
REGISTER_OP_CPU_KERNEL(
unpool_grad, ops::UnpoolGradKernel<paddle::platform::CPUPlace, float>,
ops::UnpoolGradKernel<paddle::platform::CPUPlace, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/unpool_op.h"
// Short alias used by the registration macros below.
namespace ops = paddle::operators;
// GPU forward kernels of `unpool`, instantiated for float and double.
REGISTER_OP_GPU_KERNEL(unpool,
ops::UnpoolKernel<paddle::platform::GPUPlace, float>,
ops::UnpoolKernel<paddle::platform::GPUPlace, double>);
// GPU backward kernels of `unpool_grad`, instantiated for float and double.
REGISTER_OP_GPU_KERNEL(
unpool_grad, ops::UnpoolGradKernel<paddle::platform::GPUPlace, float>,
ops::UnpoolGradKernel<paddle::platform::GPUPlace, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/op_registry.h"
#include "paddle/operators/math/math_function.h"
#include "paddle/operators/math/unpooling.h"
namespace paddle {
namespace operators {
// Forward kernel of the unpool operator.
//
// Allocates Out, fills it with zeros, then passes X, Indices and Out to
// math::Unpool2dMaxFunctor, which performs the actual unpooling.
template <typename Place, typename T>
class UnpoolKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    const framework::Tensor* in_x = context.Input<framework::Tensor>("X");
    const framework::Tensor* in_y = context.Input<framework::Tensor>("Indices");
    auto* out = context.Output<framework::Tensor>("Out");

    // The functor is called with the tensors only; the ksize/strides/paddings/
    // unpooling_type attributes are not needed here, so they are not fetched.
    T* output_data = out->mutable_data<T>(context.GetPlace());
    if (output_data) {
      // Start from an all-zero output before the functor writes into it.
      math::SetConstant<Place, T> set_zero;
      set_zero(context.device_context(), out, static_cast<T>(0));
    }

    math::Unpool2dMaxFunctor<Place, T> unpool2d_max_forward;
    unpool2d_max_forward(context.device_context(), *in_x, *in_y, out);
  }
};
// Backward kernel of the unpool operator.
//
// Allocates X@GRAD, fills it with zeros, then passes X, Indices, Out,
// Out@GRAD and X@GRAD to math::Unpool2dMaxGradFunctor.
template <typename Place, typename T>
class UnpoolGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    framework::Tensor* in_x_grad =
        context.Output<framework::Tensor>(framework::GradVarName("X"));
    // X@GRAD is the only output of this kernel and the destination handed to
    // the backward functor. When it is absent there is nothing to compute,
    // and calling the functor would pass it a null pointer.
    if (in_x_grad == nullptr) {
      return;
    }

    const framework::Tensor* in_x = context.Input<framework::Tensor>("X");
    const framework::Tensor* in_y = context.Input<framework::Tensor>("Indices");
    const framework::Tensor* out = context.Input<framework::Tensor>("Out");
    const framework::Tensor* out_grad =
        context.Input<framework::Tensor>(framework::GradVarName("Out"));

    auto& device_ctx = context.device_context();

    // Allocate the gradient buffer and start from all zeros before the
    // functor writes into it.
    in_x_grad->mutable_data<T>(context.GetPlace());
    math::SetConstant<Place, T> zero;
    zero(device_ctx, in_x_grad, static_cast<T>(0));

    math::Unpool2dMaxGradFunctor<Place, T> unpool2d_max_backward;
    unpool2d_max_backward(device_ctx, *in_x, *in_y, *out, *out_grad,
                          in_x_grad);
  }
};
} // namespace operators
} // namespace paddle
...@@ -98,8 +98,6 @@ class WhileGradOp : public framework::OperatorBase { ...@@ -98,8 +98,6 @@ class WhileGradOp : public framework::OperatorBase {
void Run(const framework::Scope &scope, void Run(const framework::Scope &scope,
const platform::DeviceContext &dev_ctx) const override { const platform::DeviceContext &dev_ctx) const override {
// PADDLE_ENFORCE(...)
framework::Executor executor(dev_ctx); framework::Executor executor(dev_ctx);
auto *block = Attr<framework::BlockDescBind *>(kStepBlock); auto *block = Attr<framework::BlockDescBind *>(kStepBlock);
auto *program = block->Program(); auto *program = block->Program();
...@@ -124,8 +122,12 @@ class WhileGradOp : public framework::OperatorBase { ...@@ -124,8 +122,12 @@ class WhileGradOp : public framework::OperatorBase {
auto inside_og_name = inside_og_names[i]; auto inside_og_name = inside_og_names[i];
VLOG(10) << "Linking outside " << outside_og_name << " --> inside " VLOG(10) << "Linking outside " << outside_og_name << " --> inside "
<< inside_og_name; << inside_og_name;
auto &og_outside = detail::Ref(scope.FindVar(outside_og_name)); auto &og_outside =
auto &og_inside = detail::Ref(cur_scope.Var(inside_og_name)); detail::Ref(scope.FindVar(outside_og_name),
"Cannot find Outside Gradient %s", outside_og_name);
auto &og_inside =
detail::Ref(cur_scope.Var(inside_og_name),
"Cannot find inside gradient %s", inside_og_name);
if (og_outside.Type().hash_code() == if (og_outside.Type().hash_code() ==
typeid(framework::LoDTensor).hash_code()) { typeid(framework::LoDTensor).hash_code()) {
auto &outside_tensor = og_outside.Get<framework::LoDTensor>(); auto &outside_tensor = og_outside.Get<framework::LoDTensor>();
...@@ -160,7 +162,7 @@ class WhileGradOp : public framework::OperatorBase { ...@@ -160,7 +162,7 @@ class WhileGradOp : public framework::OperatorBase {
PADDLE_ENFORCE_EQ(pg_names.size(), p_names.size()); PADDLE_ENFORCE_EQ(pg_names.size(), p_names.size());
for (size_t param_id = 0; param_id < pg_names.size(); ++param_id) { for (size_t param_id = 0; param_id < pg_names.size(); ++param_id) {
if (pg_names[param_id] == framework::kEmptyVarName) { if (pg_names[param_id] == framework::kEmptyVarName) {
continue; // iterator doesn't have gradient continue; // parameter doesn't have gradient
} }
auto inside_grad_name = framework::GradVarName(p_names[param_id]); auto inside_grad_name = framework::GradVarName(p_names[param_id]);
...@@ -190,7 +192,6 @@ class WhileGradOp : public framework::OperatorBase { ...@@ -190,7 +192,6 @@ class WhileGradOp : public framework::OperatorBase {
} }
} }
// sum gradient
auto new_inside_name = cur_scope.Rename(inside_grad_name); auto new_inside_name = cur_scope.Rename(inside_grad_name);
auto sum_op = framework::OpRegistry::CreateOp( auto sum_op = framework::OpRegistry::CreateOp(
"sum", {{"X", {pg_names[param_id], new_inside_name}}}, "sum", {{"X", {pg_names[param_id], new_inside_name}}},
...@@ -207,18 +208,35 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { ...@@ -207,18 +208,35 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker {
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected: protected:
virtual std::unique_ptr<framework::OpDescBind> Apply() const { std::unique_ptr<framework::OpDescBind> Apply() const override {
auto *grad = new framework::OpDescBind(); auto *grad = new framework::OpDescBind();
grad->SetType("while_grad"); grad->SetType("while_grad");
grad->SetInput(kParameters, Input(kParameters)); grad->SetInput(kParameters, Input(kParameters));
grad->SetOutput(
framework::GradVarName(kParameters), // Not all of IGs will be generated by inner gradient operators of while op.
InputGrad(kParameters, /*do not drop empty gradient*/ false)); // Ignore IGs that is not generated by the inside block.
auto igs = InputGrad(kParameters, /*do not drop empty gradient*/ false);
std::unordered_set<std::string> all_outs;
for (size_t i = 0; i < grad_block_[0]->OpSize(); ++i) {
for (auto &oname : grad_block_[0]->Op(i)->OutputArgumentNames()) {
all_outs.insert(oname);
}
}
for (auto &each_ig : igs) {
if (all_outs.find(each_ig) == all_outs.end()) {
VLOG(10) << "Ignore " << each_ig;
each_ig = framework::kEmptyVarName;
}
}
grad->SetOutput(framework::GradVarName(kParameters), igs);
grad->SetInput(kOutputs, Output(kOutputs)); grad->SetInput(kOutputs, Output(kOutputs));
// OG should be re-calculated by step blocks, since many outputs of while op // OG should be re-calculated by step blocks, since many outputs of while op
// do not need to calculate gradients. // do not need to calculate gradients.
std::unordered_set<std::string> block_ins; std::unordered_set<std::string> block_ins;
auto *fwd_block = this->grad_block_[0]->ParentBlock();
{ {
for (auto &p : Input(kParameters)) { for (auto &p : Input(kParameters)) {
block_ins.insert(p); block_ins.insert(p);
...@@ -233,6 +251,13 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { ...@@ -233,6 +251,13 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker {
if (block_ins.find(input_name) != block_ins.end()) { if (block_ins.find(input_name) != block_ins.end()) {
continue; continue;
} }
// If the input of Op is generated by the forward block, do not make it
// as input again.
if (fwd_block->FindVar(input_name) != nullptr) {
continue;
}
extra_inputs.insert(input_name); extra_inputs.insert(input_name);
} }
...@@ -287,7 +312,6 @@ class WhileGradOpShapeInference : public framework::InferShapeBase { ...@@ -287,7 +312,6 @@ class WhileGradOpShapeInference : public framework::InferShapeBase {
auto p_names = ctx->Inputs(kParameters); auto p_names = ctx->Inputs(kParameters);
auto pg_names = ctx->Outputs(kParamGrads); auto pg_names = ctx->Outputs(kParamGrads);
auto dims = ctx->GetInputsDim(kParameters);
auto var_types = ctx->GetInputsVarType(kParameters); auto var_types = ctx->GetInputsVarType(kParameters);
std::vector<std::string> names_to_set; std::vector<std::string> names_to_set;
std::vector<framework::DDim> dims_to_set; std::vector<framework::DDim> dims_to_set;
...@@ -295,13 +319,14 @@ class WhileGradOpShapeInference : public framework::InferShapeBase { ...@@ -295,13 +319,14 @@ class WhileGradOpShapeInference : public framework::InferShapeBase {
if (pg_names[i] == framework::kEmptyVarName) { if (pg_names[i] == framework::kEmptyVarName) {
continue; continue;
} }
auto dims = ctx->GetInputsElementDim(kParameters, i);
if (var_types[i] == framework::VarDesc::LOD_TENSOR) { if (var_types[i] == framework::VarDesc::LOD_TENSOR) {
names_to_set.push_back(pg_names[i]); names_to_set.push_back(pg_names[i]);
dims_to_set.push_back(dims[i]); dims_to_set.push_back(dims);
} else if (var_types[i] == framework::VarDesc::LOD_TENSOR_ARRAY) { } else if (var_types[i] == framework::VarDesc::LOD_TENSOR_ARRAY) {
// not sure how to set the dim of LOD_TENSOR_ARRAY // not sure how to set the dim of LOD_TENSOR_ARRAY
names_to_set.push_back(pg_names[i]); names_to_set.push_back(pg_names[i]);
dims_to_set.push_back(dims[i]); dims_to_set.push_back(dims);
} }
} }
ctx->SetDims(names_to_set, dims_to_set); ctx->SetDims(names_to_set, dims_to_set);
......
...@@ -127,8 +127,3 @@ TEST_F(OptimizerTest, TestGetWeight) { TestGetWeight(); } ...@@ -127,8 +127,3 @@ TEST_F(OptimizerTest, TestGetWeight) { TestGetWeight(); }
TEST_F(OptimizerTest, TestUpdate) { TestUpdate(); } TEST_F(OptimizerTest, TestUpdate) { TestUpdate(); }
TEST_F(OptimizerTest, TestCheckPoint) { TestCheckPoint(); } TEST_F(OptimizerTest, TestCheckPoint) { TestCheckPoint(); }
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
...@@ -46,8 +46,3 @@ TEST(TensorToProto, Case2) { ...@@ -46,8 +46,3 @@ TEST(TensorToProto, Case2) {
EXPECT_EQ(t1[i], t[i]); EXPECT_EQ(t1[i], t[i]);
} }
} }
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cuda_profiler_api.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
namespace paddle {
namespace platform {
// Writes `config_flags` (one entry per line) to a temporary file under /tmp
// and initialises the CUDA profiler with that file.
//
//   output_file:  forwarded to cudaProfilerInitialize as the output path.
//   output_mode:  "kvp" or "csv"; any other value fails the enforce below.
//   config_flags: the lines of the profiler configuration file.
//
// Declared inline because the definition lives in a header: without it, every
// translation unit including this header would emit its own external
// definition.
inline void CudaProfilerInit(std::string output_file, std::string output_mode,
                             std::vector<std::string> config_flags) {
  std::array<char, 128> buf;
  std::string tmpl = "/tmp/cuda_profile_config.XXXXXX";
  PADDLE_ENFORCE_LT(tmpl.size(), buf.size());
  // Copy the terminating NUL as well: buf is uninitialised and mktemp()
  // operates on a C string. The enforce above guarantees size() + 1 fits.
  memcpy(buf.data(), tmpl.c_str(), tmpl.size() + 1);
  // NOTE(review): mktemp() only produces a name; the file is created later by
  // the ofstream below, which leaves a window for another process to claim
  // the path. Consider mkstemp() if this ever runs in a shared environment.
  auto result = mktemp(buf.data());
  PADDLE_ENFORCE(strlen(result) != 0);
  std::string config_file = result;

  {
    std::ofstream ofs(config_file, std::ios::out | std::ios::trunc);
    PADDLE_ENFORCE(ofs.is_open(), "ofstream: %d",
                   static_cast<int>(ofs.rdstate()));

    for (const auto& line : config_flags) {
      ofs << line << std::endl;
    }
  }  // ofs goes out of scope here, before the profiler is given the path.

  PADDLE_ENFORCE(output_mode == "kvp" || output_mode == "csv");
  cudaOutputMode_t mode = output_mode == "csv" ? cudaCSV : cudaKeyValuePair;
  PADDLE_ENFORCE(
      cudaProfilerInitialize(config_file.c_str(), output_file.c_str(), mode));
}
// Calls cudaProfilerStart() and enforces on its result.
// Inline because the definition lives in a header.
inline void CudaProfilerStart() { PADDLE_ENFORCE(cudaProfilerStart()); }
// Calls cudaProfilerStop() and enforces on its result.
// Inline because the definition lives in a header.
inline void CudaProfilerStop() { PADDLE_ENFORCE(cudaProfilerStop()); }
} // namespace platform
} // namespace paddle
...@@ -37,6 +37,10 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP); ...@@ -37,6 +37,10 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP);
CUDNN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP); CUDNN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP);
#endif #endif
#ifdef CUDNN_DNN_ROUTINE_EACH_R7
CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP);
#endif
} // namespace dynload } // namespace dynload
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -135,6 +135,12 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) ...@@ -135,6 +135,12 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
CUDNN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) CUDNN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif #endif
#if CUDNN_VERSION >= 7001
#define CUDNN_DNN_ROUTINE_EACH_R7(__macro) \
__macro(cudnnSetConvolutionGroupCount);
CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif
} // namespace dynload } // namespace dynload
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -235,15 +235,23 @@ inline void throw_on_error(T e) { ...@@ -235,15 +235,23 @@ inline void throw_on_error(T e) {
#define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) \ #define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) \
__PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <=, >, __VA_ARGS__) __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <=, >, __VA_ARGS__)
#define PADDLE_ENFORCE_NOT_NULL(__VAL, ...) \ #define PADDLE_ENFORCE_NOT_NULL(__VAL, ...) \
PADDLE_ENFORCE(nullptr != (__VAL), #__VAL " should not be null\n%s", \ do { \
paddle::string::Sprintf("" __VA_ARGS__)); if (UNLIKELY(nullptr == (__VAL))) { \
PADDLE_THROW(#__VAL " should not be null\n%s", \
paddle::string::Sprintf("" __VA_ARGS__)); \
} \
} while (0)
#define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...) \ #define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...) \
PADDLE_ENFORCE(__VAL0 __CMP __VAL1, \ do { \
"enforce %s " #__CMP " %s failed, %s " #__INV_CMP " %s\n%s", \ if (UNLIKELY(!((__VAL0)__CMP(__VAL1)))) { \
PADDLE_THROW("enforce %s " #__CMP " %s failed, %s " #__INV_CMP \
" %s\n%s", \
#__VAL0, #__VAL1, paddle::string::to_string(__VAL0), \ #__VAL0, #__VAL1, paddle::string::to_string(__VAL0), \
paddle::string::to_string(__VAL1), \ paddle::string::to_string(__VAL1), \
paddle::string::Sprintf("" __VA_ARGS__)); paddle::string::Sprintf("" __VA_ARGS__)); \
} \
} while (0)
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -18,8 +18,8 @@ limitations under the License. */ ...@@ -18,8 +18,8 @@ limitations under the License. */
#include "paddle/platform/enforce.h" #include "paddle/platform/enforce.h"
DEFINE_double(fraction_of_gpu_memory_to_use, 0.95, DEFINE_double(fraction_of_gpu_memory_to_use, 0.92,
"Default use 95% of GPU memory for PaddlePaddle," "Default use 92% of GPU memory for PaddlePaddle,"
"reserve the rest for page tables, etc"); "reserve the rest for page tables, etc");
namespace paddle { namespace paddle {
...@@ -75,15 +75,19 @@ size_t GpuMaxChunkSize() { ...@@ -75,15 +75,19 @@ size_t GpuMaxChunkSize() {
GpuMemoryUsage(available, total); GpuMemoryUsage(available, total);
// Reserving the rest memory for page tables, etc. // Reserving the rest memory for page tables, etc.
size_t reserving = (1 - FLAGS_fraction_of_gpu_memory_to_use) * total; size_t reserving = 0.05 * total;
// If available less than minimum chunk size, no usable memory exists. // If available less than minimum chunk size, no usable memory exists.
available = std::max(available, GpuMinChunkSize()) - GpuMinChunkSize(); available =
std::max(std::max(available, GpuMinChunkSize()) - GpuMinChunkSize(),
reserving) -
reserving;
// If available less than reserving, no usable memory exists. size_t allocating = FLAGS_fraction_of_gpu_memory_to_use * total;
size_t usable = std::max(available, reserving) - reserving;
return usable; PADDLE_ENFORCE_LT(allocating, available);
return allocating;
} }
void GpuMemcpyAsync(void *dst, const void *src, size_t count, void GpuMemcpyAsync(void *dst, const void *src, size_t count,
......
...@@ -49,7 +49,7 @@ if(WITH_TESTING) ...@@ -49,7 +49,7 @@ if(WITH_TESTING)
add_subdirectory(test) add_subdirectory(test)
endif() endif()
if(NOT WITH_C_API) if(NOT MOBILE_INFERENCE)
add_executable(paddle_pserver_main ${PSERVER_MAIN_SOURCES}) add_executable(paddle_pserver_main ${PSERVER_MAIN_SOURCES})
link_paddle_exe(paddle_pserver_main) link_paddle_exe(paddle_pserver_main)
......
...@@ -5,4 +5,6 @@ if(WITH_PYTHON) ...@@ -5,4 +5,6 @@ if(WITH_PYTHON)
${GLOB_OP_LIB}) ${GLOB_OP_LIB})
endif(WITH_PYTHON) endif(WITH_PYTHON)
cc_binary(print_operators_doc SRCS print_operators_doc.cc DEPS ${GLOB_OP_LIB}) if(WITH_DOC)
cc_binary(print_operators_doc SRCS print_operators_doc.cc DEPS ${GLOB_OP_LIB})
endif(WITH_DOC)
...@@ -37,6 +37,7 @@ limitations under the License. */ ...@@ -37,6 +37,7 @@ limitations under the License. */
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include "paddle/operators/nccl/nccl_gpu_common.h" #include "paddle/operators/nccl/nccl_gpu_common.h"
#include "paddle/platform/cuda_profiler.h"
#include "paddle/platform/gpu_info.h" #include "paddle/platform/gpu_info.h"
#endif #endif
...@@ -460,6 +461,10 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -460,6 +461,10 @@ All parameter, weight, gradient are variables in Paddle.
m.def("op_support_gpu", OpSupportGPU); m.def("op_support_gpu", OpSupportGPU);
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
m.def("get_cuda_device_count", platform::GetCUDADeviceCount); m.def("get_cuda_device_count", platform::GetCUDADeviceCount);
m.def("nvprof_init", platform::CudaProfilerInit);
m.def("nvprof_start", platform::CudaProfilerStart);
m.def("nvprof_stop", platform::CudaProfilerStop);
#endif #endif
return m.ptr(); return m.ptr();
......
...@@ -36,6 +36,7 @@ function cmake_gen() { ...@@ -36,6 +36,7 @@ function cmake_gen() {
${PYTHON_FLAGS} ${PYTHON_FLAGS}
-DWITH_DOC=OFF -DWITH_DOC=OFF
-DWITH_GPU=${WITH_GPU:-OFF} -DWITH_GPU=${WITH_GPU:-OFF}
-DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF}
-DWITH_MKL=${WITH_MKL:-ON} -DWITH_MKL=${WITH_MKL:-ON}
-DWITH_AVX=${WITH_AVX:-OFF} -DWITH_AVX=${WITH_AVX:-OFF}
-DWITH_GOLANG=${WITH_GOLANG:-ON} -DWITH_GOLANG=${WITH_GOLANG:-ON}
...@@ -57,6 +58,7 @@ EOF ...@@ -57,6 +58,7 @@ EOF
${PYTHON_FLAGS} \ ${PYTHON_FLAGS} \
-DWITH_DOC=OFF \ -DWITH_DOC=OFF \
-DWITH_GPU=${WITH_GPU:-OFF} \ -DWITH_GPU=${WITH_GPU:-OFF} \
-DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} \
-DWITH_MKL=${WITH_MKL:-ON} \ -DWITH_MKL=${WITH_MKL:-ON} \
-DWITH_AVX=${WITH_AVX:-OFF} \ -DWITH_AVX=${WITH_AVX:-OFF} \
-DWITH_GOLANG=${WITH_GOLANG:-ON} \ -DWITH_GOLANG=${WITH_GOLANG:-ON} \
...@@ -183,6 +185,14 @@ EOF ...@@ -183,6 +185,14 @@ EOF
${DOCKERFILE_GPU_ENV} ${DOCKERFILE_GPU_ENV}
ADD go/cmd/pserver/pserver /usr/bin/ ADD go/cmd/pserver/pserver /usr/bin/
ADD go/cmd/master/master /usr/bin/ ADD go/cmd/master/master /usr/bin/
EOF
if [[ ${WITH_DOC:-OFF} == 'ON' ]]; then
cat >> /paddle/build/Dockerfile <<EOF
ADD paddle/pybind/print_operators_doc /usr/bin/
EOF
fi
cat >> /paddle/build/Dockerfile <<EOF
# default command shows the paddle version and exit # default command shows the paddle version and exit
CMD ["paddle", "version"] CMD ["paddle", "version"]
EOF EOF
......
...@@ -5,4 +5,8 @@ if(WITH_TESTING) ...@@ -5,4 +5,8 @@ if(WITH_TESTING)
add_dependencies(paddle_test_main paddle_proto ${external_project_dependencies}) add_dependencies(paddle_test_main paddle_proto ${external_project_dependencies})
add_library(paddle_test_util STATIC TestUtil.cpp) add_library(paddle_test_util STATIC TestUtil.cpp)
add_dependencies(paddle_test_util paddle_proto ${external_project_dependencies}) add_dependencies(paddle_test_util paddle_proto ${external_project_dependencies})
if(NOT MOBILE_INFERENCE)
add_library(paddle_gtest_main STATIC paddle_gtest_main.cc)
add_dependencies(paddle_gtest_main paddle_memory gtest gflags)
endif()
endif() endif()
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cstring>
#include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/memory/memory.h"
// Entry point for the gtest-based unit tests.
//
// Hands gflags a synthetic command line that carries only --tryfromenv, so the
// listed flags are taken from the environment when present, then initializes
// googletest with the real command line and runs every registered test.
int main(int argc, char** argv) {
  // Synthetic argv for gflags: the program name plus one --tryfromenv flag.
  // The strdup'ed string is never freed; it lives until the process exits.
  std::vector<char*> new_argv;
  new_argv.push_back(argv[0]);
#ifdef PADDLE_WITH_CUDA
  new_argv.push_back(
      strdup("--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory"));
#else
  new_argv.push_back(strdup("--tryfromenv=use_pinned_memory"));
#endif
  int new_argc = static_cast<int>(new_argv.size());
  char** new_argv_address = new_argv.data();
  // Only the synthetic argv is parsed by gflags; the real argv is left for
  // googletest below.
  google::ParseCommandLineFlags(&new_argc, &new_argv_address, false);
  testing::InitGoogleTest(&argc, argv);
  // The return values are discarded on purpose.
  // NOTE(review): presumably these calls force the per-place allocators to be
  // created before any test runs -- confirm against paddle/memory.
  paddle::memory::Used(paddle::platform::CPUPlace());
#ifdef PADDLE_WITH_CUDA
  paddle::memory::Used(paddle::platform::GPUPlace(0));
#endif
  return RUN_ALL_TESTS();
}
...@@ -54,7 +54,7 @@ if(WITH_TESTING) ...@@ -54,7 +54,7 @@ if(WITH_TESTING)
add_subdirectory(tests) add_subdirectory(tests)
endif() endif()
if(NOT WITH_C_API) if(NOT MOBILE_INFERENCE)
add_paddle_exe(paddle_trainer TrainerMain.cpp) add_paddle_exe(paddle_trainer TrainerMain.cpp)
add_paddle_exe(paddle_merge_model MergeModel.cpp) add_paddle_exe(paddle_merge_model MergeModel.cpp)
...@@ -74,7 +74,5 @@ endif() ...@@ -74,7 +74,5 @@ endif()
if(WITH_GOLANG) if(WITH_GOLANG)
add_dependencies(paddle_trainer_lib paddle_pserver_cclient) add_dependencies(paddle_trainer_lib paddle_pserver_cclient)
target_link_libraries(paddle_trainer_lib paddle_pserver_cclient) target_link_libraries(paddle_trainer_lib paddle_pserver_cclient)
if(NOT WITH_C_API)
target_link_libraries(paddle_trainer paddle_pserver_cclient) target_link_libraries(paddle_trainer paddle_pserver_cclient)
endif()
endif(WITH_GOLANG) endif(WITH_GOLANG)
################# test_Compare ############################ set(PYTHON_PATH
add_unittest_without_exec(test_Compare ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
test_Compare.cpp) ${PADDLE_SOURCE_DIR}/python/:${PADDLE_SOURCE_DIR}/paddle/trainer/tests)
add_test(NAME test_Compare function(trainer_test TARGET)
COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python add_unittest_without_exec(${TARGET} ${TARGET}.cpp)
${CMAKE_CURRENT_BINARY_DIR}/test_Compare add_test(NAME ${TARGET}
COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
endfunction()
################# test_Trainer ########################### trainer_test(test_Compare)
add_unittest_without_exec(test_Trainer trainer_test(test_PyDataProviderWrapper)
test_Trainer.cpp) trainer_test(test_recurrent_machine_generation)
add_test(NAME test_Trainer trainer_test(test_Trainer)
COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
${CMAKE_CURRENT_BINARY_DIR}/test_Trainer
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
############### test_TrainerOnePass ########################## ############### test_TrainerOnePass ##########################
if(WITH_PYTHON) if(WITH_PYTHON)
...@@ -22,32 +20,13 @@ if(WITH_PYTHON) ...@@ -22,32 +20,13 @@ if(WITH_PYTHON)
add_unittest_without_exec(test_TrainerOnePass add_unittest_without_exec(test_TrainerOnePass
test_TrainerOnePass.cpp) test_TrainerOnePass.cpp)
add_test(NAME test_TrainerOnePass add_test(NAME test_TrainerOnePass
COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d COMMAND ${PYTHON_PATH} ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port
${PADDLE_SOURCE_DIR}/python/:${PADDLE_SOURCE_DIR}/paddle/trainer/tests ${CMAKE_CURRENT_BINARY_DIR}/test_TrainerOnePass
${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port ${CMAKE_CURRENT_BINARY_DIR}/test_TrainerOnePass
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
endif() endif()
################# test_recurrent_machine_generation ###############
add_unittest_without_exec(test_recurrent_machine_generation
test_recurrent_machine_generation.cpp)
add_test(NAME test_recurrent_machine_generation
COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
${CMAKE_CURRENT_BINARY_DIR}/test_recurrent_machine_generation
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
#################### test_PyDataProviderWrapper #########################
add_unittest_without_exec(test_PyDataProviderWrapper
test_PyDataProviderWrapper.cpp)
add_test(NAME test_PyDataProviderWrapper
COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
${PADDLE_SOURCE_DIR}/python/:${PADDLE_SOURCE_DIR}/paddle/trainer/tests
${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProviderWrapper
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
#################### test_config_parser ######################### #################### test_config_parser #########################
add_test(NAME test_config_parser add_test(NAME test_config_parser
COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/ COMMAND ${PYTHON_PATH} ${PYTHON_EXECUTABLE}
${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/paddle/trainer/tests/config_parser_test.py ${PADDLE_SOURCE_DIR}/paddle/trainer/tests/config_parser_test.py
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
...@@ -2400,6 +2400,14 @@ class CropLayer(LayerBase): ...@@ -2400,6 +2400,14 @@ class CropLayer(LayerBase):
image_conf.img_size_y = input_layer.height image_conf.img_size_y = input_layer.height
image_conf.channels = input_layer.size / (input_layer.width * image_conf.channels = input_layer.size / (input_layer.width *
input_layer.height) input_layer.height)
# only support for 4-dims inputs and NCHW order
if (len(self.config.inputs) == 2):
self.set_layer_height_width(
self.get_input_layer(1).height, self.get_input_layer(1).width)
self.set_layer_size(self.get_input_layer(1).size)
else:
self.set_layer_height_width(shape[-2], shape[-1])
self.set_layer_size(reduce(lambda x, y: x * y, shape[1:]))
@config_layer('batch_norm') @config_layer('batch_norm')
...@@ -3849,6 +3857,26 @@ class SwitchOrderLayer(LayerBase): ...@@ -3849,6 +3857,26 @@ class SwitchOrderLayer(LayerBase):
name, 'switch_order', 0, inputs=inputs, **xargs) name, 'switch_order', 0, inputs=inputs, **xargs)
self.config.reshape_conf.height_axis.extend(reshape['height']) self.config.reshape_conf.height_axis.extend(reshape['height'])
self.config.reshape_conf.width_axis.extend(reshape['width']) self.config.reshape_conf.width_axis.extend(reshape['width'])
input_layer = self.get_input_layer(0)
if reshape is None:
self.set_layer_size(input_layer.size)
else:
in_h = input_layer.height
in_w = input_layer.width
out_dims = None
if input_layer.has_depth():
in_d = input_layer.depth
in_c = input_layer.size / in_h / in_w / in_d
# batch_size, depth, height, width, channel
out_dims = [0, in_d, in_h, in_w, in_c]
else:
in_c = input_layer.size / in_h / in_w
# batch_size, height, width, channel
out_dims = [0, in_h, in_w, in_c]
# reshape['width'][0] > 0 always holds, so the slice below never includes
# out_dims[0] (the batch-size placeholder).
size = reduce(lambda x, y: x * y, out_dims[reshape['width'][0]:])
self.set_layer_size(size)
@config_layer('scale_sub_region') @config_layer('scale_sub_region')
......
...@@ -6873,6 +6873,7 @@ def crop_layer(input, offset, axis=2, shape=None, name=None, layer_attr=None): ...@@ -6873,6 +6873,7 @@ def crop_layer(input, offset, axis=2, shape=None, name=None, layer_attr=None):
:param input: The input of this layer. If two inputs are given, the second one :param input: The input of this layer. If two inputs are given, the second one
will be regarded as the reference. will be regarded as the reference.
And the input must be 4-dims and in NCHW order.
:type input: LayerOutput | Sequence :type input: LayerOutput | Sequence
:param offset: The crop offset. :param offset: The crop offset.
:type offset: Sequence :type offset: Sequence
......
...@@ -13,13 +13,15 @@ import nets ...@@ -13,13 +13,15 @@ import nets
import optimizer import optimizer
import backward import backward
import regularizer import regularizer
from param_attr import ParamAttr
from data_feeder import DataFeeder
from core import LoDTensor, CPUPlace, GPUPlace from core import LoDTensor, CPUPlace, GPUPlace
Tensor = LoDTensor Tensor = LoDTensor
__all__ = framework.__all__ + executor.__all__ + [ __all__ = framework.__all__ + executor.__all__ + [
'io', 'initializer', 'layers', 'nets', 'optimizer', 'backward', 'io', 'initializer', 'layers', 'nets', 'optimizer', 'backward',
'regularizer', 'LoDTensor', 'CPUPlace', 'GPUPlace', 'Tensor' 'regularizer', 'LoDTensor', 'CPUPlace', 'GPUPlace', 'Tensor', 'ParamAttr'
'DataFeeder'
] ]
...@@ -35,7 +37,8 @@ def __read_gflags_from_env__(): ...@@ -35,7 +37,8 @@ def __read_gflags_from_env__():
read_env_flags = ['use_pinned_memory'] read_env_flags = ['use_pinned_memory']
if core.is_compile_gpu(): if core.is_compile_gpu():
read_env_flags.append('fraction_of_gpu_memory_to_use') read_env_flags.append('fraction_of_gpu_memory_to_use')
core.init_gflags(sys.argv + ["--tryfromenv=" + ",".join(read_env_flags)]) core.init_gflags([sys.argv[0]] +
["--tryfromenv=" + ",".join(read_env_flags)])
__read_gflags_from_env__() __read_gflags_from_env__()
from __future__ import print_function
import core
import numpy
import six.moves as six
from framework import Variable
__all__ = ['DataFeeder']
class DataToLoDTensorConverter(object):
    """
    Collects the samples fed for one slot and packs them into a LoDTensor.

    Args:
        place: Place passed to ``LoDTensor.set`` when the tensor is built.
        lod_level(int): Number of nesting levels of each fed sample.
        shape: Shape the flattened data is reshaped to in ``done``.
        dtype: One of ``core.DataType.{FP32, INT64, FP64, INT32}``.

    Raises:
        ValueError: If ``dtype`` is not one of the supported data types.
    """

    def __init__(self, place, lod_level, shape, dtype):
        self.place = place
        self.lod_level = lod_level
        self.shape = shape
        if dtype == core.DataType.FP32:
            self.dtype = 'float32'
        elif dtype == core.DataType.INT64:
            self.dtype = 'int64'
        elif dtype == core.DataType.FP64:
            self.dtype = 'float64'
        elif dtype == core.DataType.INT32:
            self.dtype = 'int32'
        else:
            raise ValueError("dtype must be any of [int32, float32, int64, "
                             "float64]")

        # Flat list of the innermost items of every fed sample.
        self.data = []
        # One offset list per LoD level; every level starts at offset 0.
        self.lod = []
        for i in six.range(lod_level):
            self.lod.append([0])

    def feed(self, data):
        """Append one sample, nested ``lod_level`` deep, to this converter."""
        self._feed_impl_(data, self.lod, self.lod_level)

    def _feed_impl_(self, data, lod, lod_level):
        """
        Recursively flatten ``data`` and record its sequence offsets.

        ``lod`` holds the offset lists for the levels that are still to be
        consumed, outermost first. The outermost remaining level is ``lod[0]``,
        so the length of ``data`` is recorded there and the nested items
        recurse into ``lod[1:]``. The slices share the inner lists with
        ``self.lod``, so appending to them updates the converter state.
        """
        if lod_level == 0:
            self.data.append(data)
        else:
            cur_lod_len = len(data)
            # Offsets are cumulative: new end offset = previous end + length.
            lod[0].append(lod[0][-1] + cur_lod_len)
            for each_data in data:
                self._feed_impl_(each_data, lod[1:], lod_level - 1)

    def done(self):
        """
        Build the LoDTensor from everything fed so far.

        Returns:
            core.LoDTensor: The collected data reshaped to ``self.shape``, with
            the recorded LoD attached when ``lod_level`` is greater than 0.
        """
        arr = numpy.array(self.data, dtype=self.dtype).reshape(self.shape)
        t = core.LoDTensor()
        t.set(arr, self.place)
        if self.lod_level > 0:
            t.set_lod(self.lod)
        return t
class DataFeeder(object):
    """
    Converts an iterable of samples into a feed dict of LoDTensors.

    Args:
        feed_list(list): Variables to feed; the i-th slot of every sample is
            converted for the i-th variable.
        place: Place handed to each converter when the tensors are created.

    Raises:
        TypeError: If an element of ``feed_list`` is not a ``Variable``.
        ValueError: If a variable's shape has no negative dimension.
    """

    def __init__(self, feed_list, place):
        self.feed_dtypes = []
        self.feed_names = []
        self.feed_shapes = []
        self.feed_lod_level = []
        for each_var in feed_list:
            if not isinstance(each_var, Variable):
                raise TypeError("Feed list should contain a list of variable")
            shape = each_var.shape
            # A negative extent is treated as the batch size dimension; every
            # fed variable needs one.
            if not any(s < 0 for s in shape):
                raise ValueError(
                    "Variable {0} must has a batch size dimension".format(
                        each_var.name))
            self.feed_dtypes.append(each_var.dtype)
            self.feed_names.append(each_var.name)
            self.feed_lod_level.append(each_var.lod_level)
            self.feed_shapes.append(shape)

        self.place = place

    def feed(self, iterable):
        """
        Convert ``iterable`` into a dict of tensors.

        Args:
            iterable: Iterable of samples; each sample is a sequence holding
                one slot per variable, in ``feed_list`` order.

        Returns:
            dict: Maps each variable name to the LoDTensor built from its slot.
        """
        # One fresh converter per fed variable, in feed_list order.
        converter = []
        for lod_level, shape, dtype in six.zip(
                self.feed_lod_level, self.feed_shapes, self.feed_dtypes):
            converter.append(
                DataToLoDTensorConverter(
                    place=self.place,
                    lod_level=lod_level,
                    shape=shape,
                    dtype=dtype))

        for each_sample in iterable:
            for each_converter, each_slot in six.zip(converter, each_sample):
                each_converter.feed(each_slot)

        ret_dict = {}
        for each_name, each_converter in six.zip(self.feed_names, converter):
            ret_dict[each_name] = each_converter.done()
        return ret_dict
...@@ -26,9 +26,9 @@ class Evaluator(object): ...@@ -26,9 +26,9 @@ class Evaluator(object):
name(str): The name of evaluator. such as, "accuracy". Used for generate name(str): The name of evaluator. such as, "accuracy". Used for generate
temporary variable name. temporary variable name.
main_program(Program, optional): The evaluator should be added to this main_program(Program, optional): The evaluator should be added to this
main_program. Default g_main_program main_program. Default default_main_program()
startup_program(Program, optional):The parameter should be added to this startup_program(Program, optional):The parameter should be added to this
startup_program. Default g_startup_program startup_program. Default default_startup_program()
Attributes: Attributes:
states(list): The list of state variables. states will be reset to zero states(list): The list of state variables. states will be reset to zero
......
import numpy as np import numpy as np
from . import core from . import core
from framework import Program, g_main_program from framework import Program, default_main_program
__all__ = ['Executor', 'g_scope'] __all__ = ['Executor', 'g_scope']
...@@ -103,7 +103,7 @@ class Executor(object): ...@@ -103,7 +103,7 @@ class Executor(object):
fetch_list = [] fetch_list = []
if program is None: if program is None:
program = g_main_program program = default_main_program()
if not isinstance(program, Program): if not isinstance(program, Program):
raise TypeError() raise TypeError()
......
...@@ -3,10 +3,12 @@ import collections ...@@ -3,10 +3,12 @@ import collections
import numpy as np import numpy as np
from . import core from . import core
import proto.framework_pb2 as framework_pb2 import proto.framework_pb2 as framework_pb2
import contextlib
__all__ = [ __all__ = [
'Block', 'Variable', 'Program', 'Operator', 'default_startup_program', 'Block', 'Variable', 'Program', 'Operator', 'default_startup_program',
'default_main_program', 'g_startup_program', 'g_main_program' 'default_main_program', 'program_guard', 'switch_startup_program',
'switch_main_program'
] ]
...@@ -654,13 +656,88 @@ class Parameter(Variable): ...@@ -654,13 +656,88 @@ class Parameter(Variable):
# program is a global instance. # program is a global instance.
g_main_program = Program() _main_program_ = Program()
g_startup_program = Program() _startup_program_ = Program()
def default_startup_program(): def default_startup_program():
return g_startup_program """
Get default startup program. In startup program, Paddle will initialize
parameters, initialize nccl handle, etc.
Returns:
Program: startup program
"""
return _startup_program_
def default_main_program(): def default_main_program():
return g_main_program """
Get default main program. The main program is used for training or testing.
Returns:
Program: main program
"""
return _main_program_
def switch_main_program(program):
    """
    Install ``program`` as the default main program.

    Args:
        program(Program): The program that becomes the new main program.

    Returns:
        Program: The main program that was in effect before this call.
    """
    global _main_program_
    # Swap in one step: the right-hand side is evaluated before assignment.
    previous, _main_program_ = _main_program_, program
    return previous
def switch_startup_program(program):
    """
    Install ``program`` as the default startup program.

    Args:
        program(Program): The program that becomes the new startup program.

    Returns:
        Program: The startup program that was in effect before this call.
    """
    global _startup_program_
    # Swap in one step: the right-hand side is evaluated before assignment.
    previous, _startup_program_ = _startup_program_, program
    return previous
@contextlib.contextmanager
def program_guard(main_program, startup_program=None):
    """
    Switch the default programs for the duration of a `with` statement.

    The previous programs are restored when the block exits, including when it
    exits by raising an exception.

    Examples:
        >>> with program_guard(Program()):
        >>>     data = fluid.layers.data(...)
        >>>     hidden = fluid.layers.fc(...)

    Args:
        main_program(Program): New main program inside `with` statement
        startup_program(Program): New startup program inside `with` statement.
            None means do not change startup program.

    Returns:
        None

    Raises:
        TypeError: If ``main_program`` or a non-None ``startup_program`` is not
            a ``Program``.
    """
    # Validate both arguments before touching any global state, so a bad
    # startup_program cannot leave the main program switched.
    if not isinstance(main_program, Program):
        raise TypeError("main_program should be Program")
    if startup_program is not None and not isinstance(startup_program,
                                                      Program):
        raise TypeError("startup_program should be Program")

    prev_main_program = switch_main_program(main_program)
    prev_startup_program = None
    if startup_program is not None:
        prev_startup_program = switch_startup_program(startup_program)
    try:
        yield
    finally:
        # Restore even if the body of the `with` statement raised.
        switch_main_program(prev_main_program)
        if startup_program is not None:
            switch_startup_program(prev_startup_program)
import os import os
import cPickle as pickle import cPickle as pickle
from paddle.v2.fluid.framework import Program, Parameter, g_main_program, \ from paddle.v2.fluid.framework import Program, Parameter, default_main_program, Variable
Variable
__all__ = [ __all__ = [
'save_vars', 'save_params', 'save_persistables', 'load_vars', 'load_params', 'save_vars', 'save_params', 'save_persistables', 'load_vars', 'load_params',
...@@ -46,7 +45,7 @@ def save_vars(executor, dirname, main_program=None, vars=None, predicate=None): ...@@ -46,7 +45,7 @@ def save_vars(executor, dirname, main_program=None, vars=None, predicate=None):
""" """
if vars is None: if vars is None:
if main_program is None: if main_program is None:
main_program = g_main_program main_program = default_main_program()
if not isinstance(main_program, Program): if not isinstance(main_program, Program):
raise TypeError("program should be as Program type or None") raise TypeError("program should be as Program type or None")
...@@ -98,7 +97,7 @@ def load_vars(executor, dirname, main_program=None, vars=None, predicate=None): ...@@ -98,7 +97,7 @@ def load_vars(executor, dirname, main_program=None, vars=None, predicate=None):
:param executor: executor that save variable :param executor: executor that save variable
:param dirname: directory path :param dirname: directory path
:param main_program: program. If vars is None, then filter all variables in this :param main_program: program. If vars is None, then filter all variables in this
program which fit `predicate`. Default g_program. program which fit `predicate`. Default default_main_program().
:param predicate: The Predicate describes a callable that returns a variable :param predicate: The Predicate describes a callable that returns a variable
as a bool. If it returns true, the variables will be loaded. as a bool. If it returns true, the variables will be loaded.
:param vars: variables need to be loaded. If specify vars, program & :param vars: variables need to be loaded. If specify vars, program &
...@@ -107,7 +106,7 @@ def load_vars(executor, dirname, main_program=None, vars=None, predicate=None): ...@@ -107,7 +106,7 @@ def load_vars(executor, dirname, main_program=None, vars=None, predicate=None):
""" """
if vars is None: if vars is None:
if main_program is None: if main_program is None:
main_program = g_main_program main_program = default_main_program()
if not isinstance(main_program, Program): if not isinstance(main_program, Program):
raise TypeError("program's type should be Program") raise TypeError("program's type should be Program")
...@@ -154,7 +153,7 @@ def load_persistables(executor, dirname, main_program=None): ...@@ -154,7 +153,7 @@ def load_persistables(executor, dirname, main_program=None):
def get_inference_program(target_vars, main_program=None): def get_inference_program(target_vars, main_program=None):
if main_program is None: if main_program is None:
main_program = g_main_program main_program = default_main_program()
if not isinstance(target_vars, list): if not isinstance(target_vars, list):
target_vars = [target_vars] target_vars = [target_vars]
...@@ -177,12 +176,12 @@ def save_inference_model(dirname, ...@@ -177,12 +176,12 @@ def save_inference_model(dirname,
:param target_vars: Variables from which we can get inference results. :param target_vars: Variables from which we can get inference results.
:param executor: executor that save inference model :param executor: executor that save inference model
:param main_program: original program, which will be pruned to build the inference model. :param main_program: original program, which will be pruned to build the inference model.
Default g_main_program. Default default_main_program().
:return: None :return: None
""" """
if main_program is None: if main_program is None:
main_program = g_main_program main_program = default_main_program()
if not isinstance(target_vars, list): if not isinstance(target_vars, list):
target_vars = [target_vars] target_vars = [target_vars]
...@@ -272,10 +271,10 @@ def get_parameter_value_by_name(name, executor, program=None): ...@@ -272,10 +271,10 @@ def get_parameter_value_by_name(name, executor, program=None):
:param executor: executor for retrieving the value :param executor: executor for retrieving the value
:param name: the name of the parameter :param name: the name of the parameter
:param program: the program where the variable is found :param program: the program where the variable is found
Default g_main_program. Default default_main_program().
:return: the LoDTensor for the variable :return: the LoDTensor for the variable
""" """
if program is None: if program is None:
program = g_main_program program = default_main_program()
var = program.global_block().var(name) var = program.global_block().var(name)
return get_parameter_value(var, executor) return get_parameter_value(var, executor)
import copy import copy
import itertools import itertools
from framework import Variable, g_main_program, \ from framework import Variable, default_main_program, default_startup_program, \
g_startup_program, unique_name, dtype_is_floating unique_name, dtype_is_floating
from paddle.v2.fluid.initializer import Constant, Xavier from paddle.v2.fluid.initializer import Constant, Xavier
from param_attr import ParamAttr
class LayerHelper(object): class LayerHelper(object):
...@@ -22,7 +23,7 @@ class LayerHelper(object): ...@@ -22,7 +23,7 @@ class LayerHelper(object):
def main_program(self): def main_program(self):
prog = self.kwargs.get('main_program', None) prog = self.kwargs.get('main_program', None)
if prog is None: if prog is None:
return g_main_program return default_main_program()
else: else:
return prog return prog
...@@ -30,7 +31,7 @@ class LayerHelper(object): ...@@ -30,7 +31,7 @@ class LayerHelper(object):
def startup_program(self): def startup_program(self):
prog = self.kwargs.get('startup_program', None) prog = self.kwargs.get('startup_program', None)
if prog is None: if prog is None:
return g_startup_program return default_startup_program()
else: else:
return prog return prog
...@@ -60,31 +61,15 @@ class LayerHelper(object): ...@@ -60,31 +61,15 @@ class LayerHelper(object):
@property @property
def param_attr(self): def param_attr(self):
default = {'name': None} return ParamAttr.to_attr(self.kwargs.get('param_attr', None))
actual = self.kwargs.get('param_attr', None)
if actual is None:
actual = default
for default_field in default.keys():
if default_field not in actual:
actual[default_field] = default[default_field]
return actual
@property @property
def bias_attr(self): def bias_attr(self):
default = {'name': None} return ParamAttr.to_attr(self.kwargs.get('bias_attr', None))
bias_attr = self.kwargs.get('bias_attr', None)
if bias_attr is None:
bias_attr = default
if isinstance(bias_attr, dict):
for default_field in default.keys():
if default_field not in bias_attr:
bias_attr[default_field] = default[default_field]
return bias_attr
def multiple_param_attr(self, length): def multiple_param_attr(self, length):
param_attr = self.param_attr param_attr = self.param_attr
if isinstance(param_attr, dict): if isinstance(param_attr, ParamAttr):
param_attr = [param_attr] param_attr = [param_attr]
if len(param_attr) != 1 and len(param_attr) != length: if len(param_attr) != 1 and len(param_attr) != length:
...@@ -112,23 +97,30 @@ class LayerHelper(object): ...@@ -112,23 +97,30 @@ class LayerHelper(object):
raise ValueError("Data Type mismatch") raise ValueError("Data Type mismatch")
return dtype return dtype
def create_parameter(self, attr, shape, dtype, suffix='w', def create_parameter(self,
initializer=None): attr,
shape,
dtype,
is_bias=False,
default_initializer=None):
# Deepcopy the attr so that parameters can be shared in program # Deepcopy the attr so that parameters can be shared in program
attr_copy = copy.deepcopy(attr) assert isinstance(attr, ParamAttr)
if initializer is not None: suffix = 'b' if is_bias else 'w'
attr_copy['initializer'] = initializer
if default_initializer is None:
if is_bias:
attr.set_default_bias_initializer()
else:
attr.set_default_param_initializer()
else: else:
attr_copy['initializer'] = self._get_default_initializer(dtype) attr.set_default_initializer(default_initializer)
if attr_copy['name'] is None: if attr.name is None:
attr_copy['name'] = unique_name(".".join([self.name, suffix])) attr.name = unique_name(".".join([self.name, suffix]))
self.startup_program.global_block().create_parameter( self.startup_program.global_block().create_parameter(
dtype=dtype, shape=shape, **attr_copy) dtype=dtype, shape=shape, **attr.to_kwargs(with_initializer=True))
return self.main_program.global_block().create_parameter( return self.main_program.global_block().create_parameter(
name=attr_copy['name'], dtype=dtype, shape=shape, **attr.to_kwargs())
dtype=dtype,
shape=shape,
trainable=attr_copy.get('trainable', True))
def create_tmp_variable(self, dtype): def create_tmp_variable(self, dtype):
return self.main_program.current_block().create_var( return self.main_program.current_block().create_var(
...@@ -153,11 +145,7 @@ class LayerHelper(object): ...@@ -153,11 +145,7 @@ class LayerHelper(object):
persistable=True, persistable=True,
initializer=initializer) initializer=initializer)
def append_bias_op(self, def append_bias_op(self, input_var, dim_start=1, dim_end=None):
input_var,
bias_initializer,
dim_start=1,
dim_end=None):
""" """
Append bias operator and return its output. If the user does not set Append bias operator and return its output. If the user does not set
bias_attr, append_bias_op will return input_var bias_attr, append_bias_op will return input_var
...@@ -177,11 +165,7 @@ class LayerHelper(object): ...@@ -177,11 +165,7 @@ class LayerHelper(object):
return input_var return input_var
b = self.create_parameter( b = self.create_parameter(
attr=bias_attr, attr=bias_attr, shape=size, dtype=input_var.dtype, is_bias=True)
shape=size,
dtype=input_var.dtype,
suffix='b',
initializer=bias_initializer)
tmp = self.create_tmp_variable(dtype=input_var.dtype) tmp = self.create_tmp_variable(dtype=input_var.dtype)
self.append_op( self.append_op(
type='elementwise_add', type='elementwise_add',
......
from . import core import core
import proto.framework_pb2 as framework_pb2 import proto.framework_pb2 as framework_pb2
from framework import OpProtoHolder, Variable, Program, Operator from framework import OpProtoHolder, Variable, Program, Operator
from initializer import Constant, Normal, Xavier from initializer import Constant, Normal, Xavier, Initializer
from paddle.v2.fluid.layer_helper import LayerHelper, unique_name from paddle.v2.fluid.layer_helper import LayerHelper, unique_name
import re import re
import cStringIO import cStringIO
from param_attr import ParamAttr
__all__ = [ __all__ = [
'fc', 'data', 'cross_entropy', 'conv2d', 'pool2d', 'embedding', 'concat', 'fc', 'data', 'cross_entropy', 'conv2d', 'pool2d', 'embedding', 'concat',
'StaticRNN', 'cast', 'sequence_conv', 'sequence_pool', 'sums', 'cos_sim', 'StaticRNN', 'cast', 'sequence_conv', 'sequence_pool', 'sums', 'cos_sim',
'batch_norm', 'accuracy', 'split_lod_tensor' 'batch_norm', 'accuracy', 'split_lod_tensor', 'While'
] ]
...@@ -17,9 +18,7 @@ def fc(input, ...@@ -17,9 +18,7 @@ def fc(input,
size, size,
num_flatten_dims=1, num_flatten_dims=1,
param_attr=None, param_attr=None,
param_initializer=None,
bias_attr=None, bias_attr=None,
bias_initializer=None,
act=None, act=None,
name=None, name=None,
main_program=None, main_program=None,
...@@ -32,11 +31,9 @@ def fc(input, ...@@ -32,11 +31,9 @@ def fc(input,
size: The size of the layer size: The size of the layer
num_flatten_dims: Number of columns in input num_flatten_dims: Number of columns in input
param_attr: The parameters/weights to the FC Layer param_attr: The parameters/weights to the FC Layer
param_initializer: Initializer used for the weight/parameter. param_initializer: Initializer used for the weight/parameter. If None, XavierInitializer() is used
If None, XavierInitializer() is used
bias_attr: The bias parameter for the FC layer bias_attr: The bias parameter for the FC layer
bias_initializer: Initializer used for the bias. bias_initializer: Initializer used for the bias. If None, then ConstantInitializer() is used
If None, then ConstantInitializer() is used
act: Activation to be applied to the output of FC layer act: Activation to be applied to the output of FC layer
name: Name/alias of the function name: Name/alias of the function
main_program: Name of the main program that calls this main_program: Name of the main program that calls this
...@@ -54,23 +51,10 @@ def fc(input, ...@@ -54,23 +51,10 @@ def fc(input,
to the LayerHelper constructor. to the LayerHelper constructor.
""" """
def _get_default_param_initializer():
return Xavier()
def _get_default_bias_initializer():
return Constant()
helper = LayerHelper('fc', **locals()) helper = LayerHelper('fc', **locals())
dtype = helper.input_dtype() dtype = helper.input_dtype()
if param_initializer is None:
param_initializer = _get_default_param_initializer()
if bias_initializer is None:
bias_initializer = _get_default_bias_initializer()
mul_results = [] mul_results = []
for input_var, param_attr in helper.iter_inputs_and_params(): for input_var, param_attr in helper.iter_inputs_and_params():
input_shape = input_var.shape input_shape = input_var.shape
...@@ -78,10 +62,7 @@ def fc(input, ...@@ -78,10 +62,7 @@ def fc(input,
reduce(lambda a, b: a * b, input_shape[num_flatten_dims:], 1) reduce(lambda a, b: a * b, input_shape[num_flatten_dims:], 1)
] + [size] ] + [size]
w = helper.create_parameter( w = helper.create_parameter(
attr=param_attr, attr=param_attr, shape=param_shape, dtype=dtype, is_bias=False)
initializer=param_initializer,
shape=param_shape,
dtype=dtype)
tmp = helper.create_tmp_variable(dtype) tmp = helper.create_tmp_variable(dtype)
helper.append_op( helper.append_op(
type="mul", type="mul",
...@@ -102,7 +83,7 @@ def fc(input, ...@@ -102,7 +83,7 @@ def fc(input,
helper.append_op( helper.append_op(
type="sum", inputs={"X": mul_results}, outputs={"Out": pre_bias}) type="sum", inputs={"X": mul_results}, outputs={"Out": pre_bias})
# add bias # add bias
pre_activation = helper.append_bias_op(pre_bias, bias_initializer) pre_activation = helper.append_bias_op(pre_bias)
# add activation # add activation
return helper.append_activation(pre_activation) return helper.append_activation(pre_activation)
...@@ -110,7 +91,6 @@ def fc(input, ...@@ -110,7 +91,6 @@ def fc(input,
def embedding(input, def embedding(input,
size, size,
is_sparse=False, is_sparse=False,
param_initializer=None,
param_attr=None, param_attr=None,
dtype='float32', dtype='float32',
main_program=None, main_program=None,
...@@ -119,6 +99,7 @@ def embedding(input, ...@@ -119,6 +99,7 @@ def embedding(input,
Embedding Layer. Embedding Layer.
Args: Args:
param_initializer:
input: The input to the function input: The input to the function
size: The size of the layer size: The size of the layer
is_sparse: A flag that decleares whether the input is sparse is_sparse: A flag that decleares whether the input is sparse
...@@ -136,15 +117,9 @@ def embedding(input, ...@@ -136,15 +117,9 @@ def embedding(input,
""" """
def _get_default_param_initializer():
return Xavier()
helper = LayerHelper('embedding', **locals()) helper = LayerHelper('embedding', **locals())
w = helper.create_parameter( w = helper.create_parameter(
attr=helper.param_attr, attr=helper.param_attr, shape=size, dtype=dtype, is_bias=False)
shape=size,
dtype=dtype,
initializer=param_initializer or _get_default_param_initializer())
tmp = helper.create_tmp_variable(dtype) tmp = helper.create_tmp_variable(dtype)
helper.append_op( helper.append_op(
type='lookup_table', type='lookup_table',
...@@ -176,7 +151,7 @@ def dynamic_lstm(input, ...@@ -176,7 +151,7 @@ def dynamic_lstm(input,
if not use_peepholes: if not use_peepholes:
bias_size[1] = 4 * size bias_size[1] = 4 * size
bias = helper.create_parameter( bias = helper.create_parameter(
attr=helper.bias_attr, shape=bias_size, dtype=dtype, suffix='b') attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
hidden = helper.create_tmp_variable(dtype) hidden = helper.create_tmp_variable(dtype)
cell = helper.create_tmp_variable(dtype) cell = helper.create_tmp_variable(dtype)
...@@ -208,6 +183,7 @@ def data(name, ...@@ -208,6 +183,7 @@ def data(name,
shape, shape,
append_batch_size=True, append_batch_size=True,
dtype='float32', dtype='float32',
lod_level=0,
type=core.VarDesc.VarType.LOD_TENSOR, type=core.VarDesc.VarType.LOD_TENSOR,
main_program=None, main_program=None,
startup_program=None, startup_program=None,
...@@ -221,6 +197,7 @@ def data(name, ...@@ -221,6 +197,7 @@ def data(name,
append_batch_size: Whether or not to append the data as a batch. append_batch_size: Whether or not to append the data as a batch.
dtype: The type of data : float32, float_16, int etc dtype: The type of data : float32, float_16, int etc
type: The output type. By default it is LOD_TENSOR. type: The output type. By default it is LOD_TENSOR.
lod_level(int): The LoD Level. 0 means the input data is not a sequence.
main_program: Name of the main program that calls this main_program: Name of the main program that calls this
startup_program: Name of the startup program startup_program: Name of the startup program
stop_gradient: A boolean that mentions whether gradient should flow. stop_gradient: A boolean that mentions whether gradient should flow.
...@@ -251,7 +228,8 @@ def data(name, ...@@ -251,7 +228,8 @@ def data(name,
shape=shape, shape=shape,
dtype=dtype, dtype=dtype,
type=type, type=type,
stop_gradient=stop_gradient) stop_gradient=stop_gradient,
lod_level=lod_level)
def create_tensor(dtype, name=None, main_program=None, startup_program=None): def create_tensor(dtype, name=None, main_program=None, startup_program=None):
...@@ -423,6 +401,7 @@ _create_op_func_('sigmoid') ...@@ -423,6 +401,7 @@ _create_op_func_('sigmoid')
_create_op_func_('scale') _create_op_func_('scale')
_create_op_func_('reshape') _create_op_func_('reshape')
_create_op_func_('transpose') _create_op_func_('transpose')
_create_op_func_('sigmoid_cross_entropy_with_logits')
def cast(x, dtype, main_program=None): def cast(x, dtype, main_program=None):
...@@ -471,19 +450,14 @@ def sums(input, out=None, main_program=None, startup_program=None): ...@@ -471,19 +450,14 @@ def sums(input, out=None, main_program=None, startup_program=None):
def linear_chain_crf(input, def linear_chain_crf(input,
label, label,
param_attr=None, param_attr=None,
param_initializer=None,
main_program=None, main_program=None,
startup_program=None): startup_program=None):
def _get_default_param_initializer():
return Xavier()
helper = LayerHelper('linear_chain_crf', **locals()) helper = LayerHelper('linear_chain_crf', **locals())
size = input.shape[1] size = input.shape[1]
transition = helper.create_parameter( transition = helper.create_parameter(
attr=helper.param_attr, attr=helper.param_attr,
shape=[size + 2, size], shape=[size + 2, size],
dtype=helper.input_dtype(), dtype=helper.input_dtype())
initializer=param_initializer or _get_default_param_initializer())
alpha = helper.create_tmp_variable(dtype=helper.input_dtype()) alpha = helper.create_tmp_variable(dtype=helper.input_dtype())
emission_exps = helper.create_tmp_variable(dtype=helper.input_dtype()) emission_exps = helper.create_tmp_variable(dtype=helper.input_dtype())
transition_exps = helper.create_tmp_variable(dtype=helper.input_dtype()) transition_exps = helper.create_tmp_variable(dtype=helper.input_dtype())
...@@ -646,9 +620,7 @@ def sequence_conv(input, ...@@ -646,9 +620,7 @@ def sequence_conv(input,
filter_stride=1, filter_stride=1,
padding=None, padding=None,
bias_attr=None, bias_attr=None,
bias_initializer=None,
param_attr=None, param_attr=None,
param_initializer=None,
act=None, act=None,
main_program=None, main_program=None,
startup_program=None): startup_program=None):
...@@ -658,30 +630,15 @@ def sequence_conv(input, ...@@ -658,30 +630,15 @@ def sequence_conv(input,
in the input parameters to the function. in the input parameters to the function.
""" """
def _get_default_bias_initializer():
return Constant()
def _get_default_param_initializer():
return Xavier()
# FIXME(dzh) : want to unify the argument of python layer # FIXME(dzh) : want to unify the argument of python layer
# function. So we ignore some unecessary attributes. # function. So we ignore some unecessary attributes.
# such as, padding_trainable, context_start. # such as, padding_trainable, context_start.
helper = LayerHelper('sequence_conv', **locals()) helper = LayerHelper('sequence_conv', **locals())
dtype = helper.input_dtype() dtype = helper.input_dtype()
if param_initializer is None:
param_initializer = _get_default_param_initializer()
if bias_initializer is None:
bias_initializer = _get_default_bias_initializer()
filter_shape = [filter_size * input.shape[1], num_filters] filter_shape = [filter_size * input.shape[1], num_filters]
filter = helper.create_parameter( filter = helper.create_parameter(
attr=helper.param_attr, attr=helper.param_attr, shape=filter_shape, dtype=dtype)
shape=filter_shape,
dtype=dtype,
initializer=param_initializer)
pre_bias = helper.create_tmp_variable(dtype) pre_bias = helper.create_tmp_variable(dtype)
helper.append_op( helper.append_op(
...@@ -696,7 +653,7 @@ def sequence_conv(input, ...@@ -696,7 +653,7 @@ def sequence_conv(input,
'contextStart': -int(filter_size / 2), 'contextStart': -int(filter_size / 2),
'contextLength': filter_size 'contextLength': filter_size
}) })
pre_act = helper.append_bias_op(pre_bias, bias_initializer) pre_act = helper.append_bias_op(pre_bias)
return helper.append_activation(pre_act) return helper.append_activation(pre_act)
...@@ -707,9 +664,7 @@ def conv2d(input, ...@@ -707,9 +664,7 @@ def conv2d(input,
padding=None, padding=None,
groups=None, groups=None,
param_attr=None, param_attr=None,
param_initializer=None,
bias_attr=None, bias_attr=None,
bias_initializer=None,
act=None, act=None,
name=None, name=None,
main_program=None, main_program=None,
...@@ -722,13 +677,6 @@ def conv2d(input, ...@@ -722,13 +677,6 @@ def conv2d(input,
conv-2d output, if mentioned in the input parameters. conv-2d output, if mentioned in the input parameters.
""" """
def _get_default_bias_initializer():
return Constant()
def _get_default_param_initializer(filter_size, num_channels):
std = (2.0 / (filter_size[0]**2 * num_channels))**0.5
return Normal(0.0, std, 0)
helper = LayerHelper('conv2d', **locals()) helper = LayerHelper('conv2d', **locals())
dtype = helper.input_dtype() dtype = helper.input_dtype()
...@@ -750,17 +698,16 @@ def conv2d(input, ...@@ -750,17 +698,16 @@ def conv2d(input,
input_shape = input.shape input_shape = input.shape
filter_shape = [num_filters, num_filter_channels] + filter_size filter_shape = [num_filters, num_filter_channels] + filter_size
if param_initializer is None: def _get_default_param_initializer():
param_initializer = _get_default_param_initializer(filter_size, std = (2.0 / (filter_size[0]**2 * num_channels))**0.5
num_channels) return Normal(0.0, std, 0)
if bias_initializer is None:
bias_initializer = _get_default_bias_initializer()
filter = helper.create_parameter( filter = helper.create_parameter(
attr=helper.param_attr, attr=helper.param_attr,
shape=filter_shape, shape=filter_shape,
dtype=dtype, dtype=dtype,
initializer=param_initializer) default_initializer=_get_default_param_initializer())
pre_bias = helper.create_tmp_variable(dtype) pre_bias = helper.create_tmp_variable(dtype)
helper.append_op( helper.append_op(
...@@ -774,8 +721,7 @@ def conv2d(input, ...@@ -774,8 +721,7 @@ def conv2d(input,
'paddings': padding, 'paddings': padding,
'groups': groups}) 'groups': groups})
pre_act = helper.append_bias_op( pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
pre_bias, bias_initializer, dim_start=1, dim_end=2)
return helper.append_activation(pre_act) return helper.append_activation(pre_act)
...@@ -876,12 +822,10 @@ def batch_norm(input, ...@@ -876,12 +822,10 @@ def batch_norm(input,
attr=helper.param_attr, attr=helper.param_attr,
shape=param_shape, shape=param_shape,
dtype=dtype, dtype=dtype,
initializer=Constant(1.0)) default_initializer=Constant(1.0))
bias = helper.create_parameter( bias = helper.create_parameter(
attr=helper.param_attr, attr=helper.param_attr, shape=param_shape, dtype=dtype, is_bias=True)
shape=param_shape,
dtype=dtype,
initializer=Constant(0.0))
mean = helper.create_global_variable( mean = helper.create_global_variable(
dtype=input.dtype, shape=param_shape, persistable=True) dtype=input.dtype, shape=param_shape, persistable=True)
...@@ -1495,7 +1439,7 @@ def increment(x, value=1.0, in_place=True, main_program=None): ...@@ -1495,7 +1439,7 @@ def increment(x, value=1.0, in_place=True, main_program=None):
type='increment', type='increment',
inputs={'X': [x]}, inputs={'X': [x]},
outputs={'Out': [out]}, outputs={'Out': [out]},
attrs={'step': value}) attrs={'step': float(value)})
return out return out
...@@ -1587,6 +1531,93 @@ def array_length(array, main_program=None): ...@@ -1587,6 +1531,93 @@ def array_length(array, main_program=None):
return tmp return tmp
def conv2d_transpose(input,
                     num_filters,
                     output_size=None,
                     filter_size=None,
                     padding=None,
                     stride=None,
                     param_attr=None,
                     main_program=None,
                     startup_program=None):
    """
    The transpose of conv2d layer.

    This layer is also known as deconvolution layer.

    Args:
        input(Variable): The input image with [N, C, H, W] format.
        num_filters(int): The number of filters. It is the same as the
            output image channel.
        output_size(int|tuple|None): The output image size. If output size is a
            tuple, it must contain two integers, (image_H, image_W). This
            parameter only works when filter_size is None.
        filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
            it must contain two integers, (filter_size_H, filter_size_W).
            Otherwise, the filter will be a square. None if output_size is
            used to calculate filter_size.
        padding(int|tuple): The padding size. If padding is a tuple, it must
            contain two integers, (padding_H, padding_W). Otherwise, the
            padding_H = padding_W = padding.
        stride(int|tuple): The stride size. If stride is a tuple, it must
            contain two integers, (stride_H, stride_W). Otherwise, the
            stride_H = stride_W = stride.
        param_attr: Parameter Attribute.
        main_program(Program): the main program
        startup_program(Program): the startup program

    Returns:
        Variable: Output image.

    Raises:
        TypeError: If input is not a Variable.
        ValueError: If both filter_size and output_size are None.
    """
    # Must remain the first statement: locals() forwards exactly the call
    # arguments to LayerHelper, so no other local may exist yet.
    helper = LayerHelper("conv2d_transpose", **locals())
    if not isinstance(input, Variable):
        raise TypeError("Input of conv2d_transpose must be Variable")
    input_channel = input.shape[1]

    # Only attributes the caller actually supplied are passed to the op.
    op_attr = dict()

    if isinstance(padding, int):
        op_attr['paddings'] = [padding, padding]
    elif padding is not None:
        op_attr['paddings'] = padding

    if isinstance(stride, int):
        # Expand a scalar stride to (stride_H, stride_W), mirroring padding.
        # Storing the bare int made the stride[0]/stride[1] lookups below
        # fail and handed the op a scalar instead of a pair.
        op_attr['strides'] = [stride, stride]
    elif stride is not None:
        op_attr['strides'] = stride

    if filter_size is None:
        if output_size is None:
            raise ValueError("output_size must be set when filter_size is None")
        if isinstance(output_size, int):
            output_size = [output_size, output_size]

        # Fall back to zero padding / unit stride when the caller gave none.
        padding = op_attr.get('paddings', [0, 0])
        stride = op_attr.get('strides', [1, 1])

        h_in = input.shape[2]
        w_in = input.shape[3]
        # Solve out = (in - 1) * stride - 2 * padding + filter for filter.
        filter_size_h = output_size[0] - (h_in - 1) * stride[0] + 2 * padding[0]
        filter_size_w = output_size[1] - (w_in - 1) * stride[1] + 2 * padding[1]
        filter_size = [filter_size_h, filter_size_w]
    elif isinstance(filter_size, int):
        filter_size = [filter_size, filter_size]
    else:
        # The documented tuple form cannot be concatenated to a list below.
        filter_size = list(filter_size)

    filter_shape = [input_channel, num_filters] + filter_size
    img_filter = helper.create_parameter(
        dtype=input.dtype, shape=filter_shape, attr=helper.param_attr)

    out = helper.create_tmp_variable(dtype=input.dtype)
    helper.append_op(
        type='conv2d_transpose',
        inputs={'Input': [input],
                'Filter': [img_filter]},
        outputs={'Output': out},
        attrs=op_attr)
    return out
class ConditionalBlockGuard(BlockGuard): class ConditionalBlockGuard(BlockGuard):
def __init__(self, block): def __init__(self, block):
if not isinstance(block, ConditionalBlock): if not isinstance(block, ConditionalBlock):
......
...@@ -197,8 +197,7 @@ class Optimizer(object): ...@@ -197,8 +197,7 @@ class Optimizer(object):
This method combines interface `append_backward_ops()` and This method combines interface `append_backward_ops()` and
`create_optimization_pass()` into one. `create_optimization_pass()` into one.
""" """
params_grads = append_backward_ops(loss, parameter_list, no_grad_set or params_grads = append_backward_ops(loss, parameter_list, no_grad_set)
set())
# Add regularization if any # Add regularization if any
params_grads = append_regularization_ops(params_grads) params_grads = append_regularization_ops(params_grads)
optimize_ops = self.create_optimization_pass(params_grads, loss, optimize_ops = self.create_optimization_pass(params_grads, loss,
......
from initializer import Initializer, Xavier, Constant
from regularizer import WeightDecayRegularizer
class ParamAttr(object):
    """
    Container for the settings attached to one parameter: its name,
    initializer, learning rate, regularizer and trainable flag.
    """

    def __init__(self,
                 name=None,
                 initializer=None,
                 learning_rate=1.0,
                 regularizer=None,
                 trainable=True):
        self.name = name
        self.initializer = initializer
        self.learning_rate = learning_rate
        self.regularizer = regularizer
        self.trainable = trainable

    def set_default_initializer(self, initializer):
        """
        Install `initializer` only if no initializer has been set yet.

        Raises:
            ValueError: if neither this attribute nor the caller provides
                an initializer.
        """
        if self.initializer is not None:
            # An explicitly configured initializer always wins.
            return
        if initializer is None:
            raise ValueError("ParamAttr.initializer is not set")
        self.initializer = initializer

    def set_default_param_initializer(self):
        """Use Xavier() as the fallback initializer."""
        self.set_default_initializer(Xavier())

    def set_default_bias_initializer(self):
        """Use Constant(0.0) as the fallback initializer."""
        self.set_default_initializer(Constant(0.0))

    @staticmethod
    def to_attr(arg):
        """
        Convert a shorthand argument into a ParamAttr.

        None gives a default ParamAttr; a ParamAttr is returned unchanged;
        a string becomes the name; an Initializer or WeightDecayRegularizer
        fills the matching field; True gives a default ParamAttr and False
        is returned as False.

        Raises:
            TypeError: for any other argument type.
        """
        if arg is None:
            return ParamAttr()
        if isinstance(arg, ParamAttr):
            return arg
        if isinstance(arg, str) or isinstance(arg, unicode):
            return ParamAttr(name=arg)
        if isinstance(arg, Initializer):
            return ParamAttr(initializer=arg)
        if isinstance(arg, WeightDecayRegularizer):
            return ParamAttr(regularizer=arg)
        if isinstance(arg, bool):
            if arg:
                return ParamAttr()
            return False
        raise TypeError("{0} cast to ParamAttr".format(type(arg)))

    def to_kwargs(self, with_initializer=False):
        """
        Return the settings as a keyword-argument dict; the 'initializer'
        entry is included only when `with_initializer` is true.
        """
        result = dict()
        result['name'] = self.name
        result['learning_rate'] = self.learning_rate
        result['regularizer'] = self.regularizer
        result['trainable'] = self.trainable
        if with_initializer:
            result['initializer'] = self.initializer
        return result
import paddle.v2.fluid.core as core
from contextlib import contextmanager
# Public API of this module. The only public callable defined here is
# cuda_profiler; the previous entry named a symbol that does not exist,
# which makes `from <module> import *` fail.
__all__ = ['cuda_profiler']

# Default profiler options/counters passed to core.nvprof_init() when the
# caller of cuda_profiler() does not provide its own `config`.
NVPROF_CONFIG = [
    "gpustarttimestamp",
    "gpuendtimestamp",
    "gridsize3d",
    "threadblocksize",
    "streamid",
    "enableonstart 0",
    "conckerneltrace",
]
@contextmanager
def cuda_profiler(output_file, output_mode=None, config=None):
    """The CUDA profiler.

    This function is used to profile a CUDA program through the CUDA runtime
    application programming interface. The profiling result will be written
    into `output_file` in Key-Value pair format or Comma separated values
    format. The user can set the output mode with the `output_mode` argument
    and set the counters/options for profiling with the `config` argument.
    The default config is ['gpustarttimestamp', 'gpuendtimestamp',
    'gridsize3d', 'threadblocksize', 'streamid', 'enableonstart 0',
    'conckerneltrace'].

    Args:
        output_file (string) : The output file name, the result will be
            written into this file.
        output_mode (string) : The output mode has Key-Value pair format and
            Comma separated values format. It should be 'kvp' or 'csv'.
            Defaults to 'csv' when None.
        config (list of string) : The profiler options and counters; see the
            "Compute Command Line Profiler User Guide". Defaults to
            NVPROF_CONFIG when None.

    Raises:
        ValueError: If `output_mode` is neither 'kvp' nor 'csv'.
    """
    if output_mode is None:
        output_mode = 'csv'
    if output_mode not in ['kvp', 'csv']:
        raise ValueError("The output mode must be 'kvp' or 'csv'.")
    config = NVPROF_CONFIG if config is None else config
    core.nvprof_init(output_file, output_mode, config)
    # Enables profiler collection by the active CUDA profiling tool.
    core.nvprof_start()
    try:
        yield
    finally:
        # Disables profiler collection. Runs even when the profiled block
        # raises, so the profiler is never left running.
        core.nvprof_stop()
...@@ -22,6 +22,7 @@ train_reader = paddle.batch( ...@@ -22,6 +22,7 @@ train_reader = paddle.batch(
batch_size=BATCH_SIZE) batch_size=BATCH_SIZE)
place = fluid.CPUPlace() place = fluid.CPUPlace()
feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
exe = fluid.Executor(place) exe = fluid.Executor(place)
exe.run(fluid.default_startup_program()) exe.run(fluid.default_startup_program())
...@@ -31,12 +32,8 @@ for pass_id in range(PASS_NUM): ...@@ -31,12 +32,8 @@ for pass_id in range(PASS_NUM):
fluid.io.save_persistables(exe, "./fit_a_line.model/") fluid.io.save_persistables(exe, "./fit_a_line.model/")
fluid.io.load_persistables(exe, "./fit_a_line.model/") fluid.io.load_persistables(exe, "./fit_a_line.model/")
for data in train_reader(): for data in train_reader():
x_data = np.array(map(lambda _: _[0], data)).astype("float32")
y_data = np.array(map(lambda _: _[1], data)).astype("float32")
avg_loss_value, = exe.run(fluid.default_main_program(), avg_loss_value, = exe.run(fluid.default_main_program(),
feed={'x': x_data, feed=feeder.feed(data),
'y': y_data},
fetch_list=[avg_cost]) fetch_list=[avg_cost])
if avg_loss_value[0] < 10.0: if avg_loss_value[0] < 10.0:
......
...@@ -69,8 +69,7 @@ def vgg16_bn_drop(input): ...@@ -69,8 +69,7 @@ def vgg16_bn_drop(input):
drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5) drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
fc1 = fluid.layers.fc(input=drop, size=512, act=None) fc1 = fluid.layers.fc(input=drop, size=512, act=None)
reshape1 = fluid.layers.reshape(x=fc1, shape=list(fc1.shape + (1, 1))) bn = fluid.layers.batch_norm(input=fc1, act='relu')
bn = fluid.layers.batch_norm(input=reshape1, act='relu')
drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5) drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
fc2 = fluid.layers.fc(input=drop2, size=512, act=None) fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
return fc2 return fc2
...@@ -114,23 +113,14 @@ train_reader = paddle.batch( ...@@ -114,23 +113,14 @@ train_reader = paddle.batch(
place = fluid.CPUPlace() place = fluid.CPUPlace()
exe = fluid.Executor(place) exe = fluid.Executor(place)
feeder = fluid.DataFeeder(place=place, feed_list=[images, label])
exe.run(fluid.default_startup_program()) exe.run(fluid.default_startup_program())
for pass_id in range(PASS_NUM): for pass_id in range(PASS_NUM):
accuracy.reset(exe) accuracy.reset(exe)
for data in train_reader(): for data in train_reader():
img_data = np.array(map(lambda x: x[0].reshape(data_shape),
data)).astype("float32")
y_data = np.array(map(lambda x: x[1], data)).astype("int64")
batch_size = 1
for i in y_data.shape:
batch_size = batch_size * i
y_data = y_data.reshape([batch_size, 1])
loss, acc = exe.run(fluid.default_main_program(), loss, acc = exe.run(fluid.default_main_program(),
feed={"pixel": img_data, feed=feeder.feed(data),
"label": y_data},
fetch_list=[avg_cost] + accuracy.metrics) fetch_list=[avg_cost] + accuracy.metrics)
pass_acc = accuracy.eval(exe) pass_acc = accuracy.eval(exe)
print("loss:" + str(loss) + " acc:" + str(acc) + " pass_acc:" + str( print("loss:" + str(loss) + " acc:" + str(acc) + " pass_acc:" + str(
......
...@@ -28,23 +28,15 @@ def load_parameter(file_name, h, w): ...@@ -28,23 +28,15 @@ def load_parameter(file_name, h, w):
return np.fromfile(f, dtype=np.float32).reshape(h, w) return np.fromfile(f, dtype=np.float32).reshape(h, w)
def db_lstm(): def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
**ignored):
# 8 features # 8 features
word = fluid.layers.data(name='word_data', shape=[1], dtype='int64')
predicate = fluid.layers.data(name='verb_data', shape=[1], dtype='int64')
ctx_n2 = fluid.layers.data(name='ctx_n2_data', shape=[1], dtype='int64')
ctx_n1 = fluid.layers.data(name='ctx_n1_data', shape=[1], dtype='int64')
ctx_0 = fluid.layers.data(name='ctx_0_data', shape=[1], dtype='int64')
ctx_p1 = fluid.layers.data(name='ctx_p1_data', shape=[1], dtype='int64')
ctx_p2 = fluid.layers.data(name='ctx_p2_data', shape=[1], dtype='int64')
mark = fluid.layers.data(name='mark_data', shape=[1], dtype='int64')
predicate_embedding = fluid.layers.embedding( predicate_embedding = fluid.layers.embedding(
input=predicate, input=predicate,
size=[pred_len, word_dim], size=[pred_len, word_dim],
dtype='float32', dtype='float32',
is_sparse=IS_SPARSE, is_sparse=IS_SPARSE,
param_attr={'name': 'vemb'}) param_attr='vemb')
mark_embedding = fluid.layers.embedding( mark_embedding = fluid.layers.embedding(
input=mark, input=mark,
...@@ -57,8 +49,8 @@ def db_lstm(): ...@@ -57,8 +49,8 @@ def db_lstm():
fluid.layers.embedding( fluid.layers.embedding(
size=[word_dict_len, word_dim], size=[word_dict_len, word_dim],
input=x, input=x,
param_attr={'name': embedding_name, param_attr=fluid.ParamAttr(
'trainable': False}) for x in word_input name=embedding_name, trainable=False)) for x in word_input
] ]
emb_layers.append(predicate_embedding) emb_layers.append(predicate_embedding)
emb_layers.append(mark_embedding) emb_layers.append(mark_embedding)
...@@ -120,13 +112,30 @@ def to_lodtensor(data, place): ...@@ -120,13 +112,30 @@ def to_lodtensor(data, place):
def main(): def main():
# define network topology # define network topology
feature_out = db_lstm() word = fluid.layers.data(
target = fluid.layers.data(name='target', shape=[1], dtype='int64') name='word_data', shape=[1], dtype='int64', lod_level=1)
predicate = fluid.layers.data(
name='verb_data', shape=[1], dtype='int64', lod_level=1)
ctx_n2 = fluid.layers.data(
name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1)
ctx_n1 = fluid.layers.data(
name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1)
ctx_0 = fluid.layers.data(
name='ctx_0_data', shape=[1], dtype='int64', lod_level=1)
ctx_p1 = fluid.layers.data(
name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1)
ctx_p2 = fluid.layers.data(
name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1)
mark = fluid.layers.data(
name='mark_data', shape=[1], dtype='int64', lod_level=1)
feature_out = db_lstm(**locals())
target = fluid.layers.data(
name='target', shape=[1], dtype='int64', lod_level=1)
crf_cost = fluid.layers.linear_chain_crf( crf_cost = fluid.layers.linear_chain_crf(
input=feature_out, input=feature_out,
label=target, label=target,
param_attr={"name": 'crfw', param_attr=fluid.ParamAttr(
"learning_rate": mix_hidden_lr}) name='crfw', learning_rate=mix_hidden_lr))
avg_cost = fluid.layers.mean(x=crf_cost) avg_cost = fluid.layers.mean(x=crf_cost)
# TODO(qiao) # TODO(qiao)
# 1. add crf_decode_layer and evaluator # 1. add crf_decode_layer and evaluator
...@@ -139,6 +148,11 @@ def main(): ...@@ -139,6 +148,11 @@ def main():
paddle.dataset.conll05.test(), buf_size=8192), paddle.dataset.conll05.test(), buf_size=8192),
batch_size=BATCH_SIZE) batch_size=BATCH_SIZE)
place = fluid.CPUPlace() place = fluid.CPUPlace()
feeder = fluid.DataFeeder(
feed_list=[
word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate, mark, target
],
place=place)
exe = fluid.Executor(place) exe = fluid.Executor(place)
exe.run(fluid.default_startup_program()) exe.run(fluid.default_startup_program())
...@@ -150,28 +164,8 @@ def main(): ...@@ -150,28 +164,8 @@ def main():
batch_id = 0 batch_id = 0
for pass_id in xrange(PASS_NUM): for pass_id in xrange(PASS_NUM):
for data in train_data(): for data in train_data():
word_data = to_lodtensor(map(lambda x: x[0], data), place)
ctx_n2_data = to_lodtensor(map(lambda x: x[1], data), place)
ctx_n1_data = to_lodtensor(map(lambda x: x[2], data), place)
ctx_0_data = to_lodtensor(map(lambda x: x[3], data), place)
ctx_p1_data = to_lodtensor(map(lambda x: x[4], data), place)
ctx_p2_data = to_lodtensor(map(lambda x: x[5], data), place)
verb_data = to_lodtensor(map(lambda x: x[6], data), place)
mark_data = to_lodtensor(map(lambda x: x[7], data), place)
target = to_lodtensor(map(lambda x: x[8], data), place)
outs = exe.run(fluid.default_main_program(), outs = exe.run(fluid.default_main_program(),
feed={ feed=feeder.feed(data),
'word_data': word_data,
'ctx_n2_data': ctx_n2_data,
'ctx_n1_data': ctx_n1_data,
'ctx_0_data': ctx_0_data,
'ctx_p1_data': ctx_p1_data,
'ctx_p2_data': ctx_p2_data,
'verb_data': verb_data,
'mark_data': mark_data,
'target': target
},
fetch_list=[avg_cost]) fetch_list=[avg_cost])
avg_cost_val = np.array(outs[0]) avg_cost_val = np.array(outs[0])
......
...@@ -37,20 +37,14 @@ train_reader = paddle.batch( ...@@ -37,20 +37,14 @@ train_reader = paddle.batch(
place = fluid.CPUPlace() place = fluid.CPUPlace()
exe = fluid.Executor(place) exe = fluid.Executor(place)
feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
exe.run(fluid.default_startup_program()) exe.run(fluid.default_startup_program())
for pass_id in range(PASS_NUM): for pass_id in range(PASS_NUM):
accuracy.reset(exe) accuracy.reset(exe)
for data in train_reader(): for data in train_reader():
img_data = np.array(map(lambda x: x[0].reshape([1, 28, 28]),
data)).astype("float32")
y_data = np.array(map(lambda x: x[1], data)).astype("int64")
y_data = y_data.reshape([BATCH_SIZE, 1])
loss, acc = exe.run(fluid.default_main_program(), loss, acc = exe.run(fluid.default_main_program(),
feed={"pixel": img_data, feed=feeder.feed(data),
"label": y_data},
fetch_list=[avg_cost] + accuracy.metrics) fetch_list=[avg_cost] + accuracy.metrics)
pass_acc = accuracy.eval(exe) pass_acc = accuracy.eval(exe)
print("pass_id=" + str(pass_id) + " acc=" + str(acc) + " pass_acc=" + print("pass_id=" + str(pass_id) + " acc=" + str(acc) + " pass_acc=" +
......
...@@ -6,24 +6,21 @@ import paddle.v2.fluid as fluid ...@@ -6,24 +6,21 @@ import paddle.v2.fluid as fluid
BATCH_SIZE = 128 BATCH_SIZE = 128
image = fluid.layers.data(name='x', shape=[784], dtype='float32') image = fluid.layers.data(name='x', shape=[784], dtype='float32')
param_attr = { regularizer = fluid.regularizer.L2Decay(0.0005 * BATCH_SIZE)
'name': None,
'regularization': fluid.regularizer.L2Decay(0.0005 * BATCH_SIZE)
}
hidden1 = fluid.layers.fc(input=image, hidden1 = fluid.layers.fc(input=image,
size=128, size=128,
act='relu', act='relu',
param_attr=param_attr) param_attr=regularizer)
hidden2 = fluid.layers.fc(input=hidden1, hidden2 = fluid.layers.fc(input=hidden1,
size=64, size=64,
act='relu', act='relu',
param_attr=param_attr) param_attr=regularizer)
predict = fluid.layers.fc(input=hidden2, predict = fluid.layers.fc(input=hidden2,
size=10, size=10,
act='softmax', act='softmax',
param_attr=param_attr) param_attr=regularizer)
label = fluid.layers.data(name='y', shape=[1], dtype='int64') label = fluid.layers.data(name='y', shape=[1], dtype='int64')
...@@ -51,40 +48,22 @@ test_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size=128) ...@@ -51,40 +48,22 @@ test_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size=128)
place = fluid.CPUPlace() place = fluid.CPUPlace()
exe = fluid.Executor(place) exe = fluid.Executor(place)
feeder = fluid.DataFeeder(feed_list=[image, label], place=place)
exe.run(fluid.default_startup_program()) exe.run(fluid.default_startup_program())
PASS_NUM = 100 PASS_NUM = 100
for pass_id in range(PASS_NUM): for pass_id in range(PASS_NUM):
accuracy.reset(exe) accuracy.reset(exe)
for data in train_reader(): for data in train_reader():
x_data = np.array(map(lambda x: x[0], data)).astype("float32") out, acc = exe.run(fluid.default_main_program(),
y_data = np.array(map(lambda x: x[1], data)).astype("int64") feed=feeder.feed(data),
y_data = np.expand_dims(y_data, axis=1)
tensor_x = fluid.LoDTensor()
tensor_x.set(x_data, place)
tensor_y = fluid.LoDTensor()
tensor_y.set(y_data, place)
outs = exe.run(fluid.default_main_program(),
feed={'x': tensor_x,
'y': tensor_y},
fetch_list=[avg_cost] + accuracy.metrics) fetch_list=[avg_cost] + accuracy.metrics)
out = np.array(outs[0])
acc = np.array(outs[1])
pass_acc = accuracy.eval(exe) pass_acc = accuracy.eval(exe)
test_accuracy.reset(exe) test_accuracy.reset(exe)
for data in test_reader(): for data in test_reader():
x_data = np.array(map(lambda x: x[0], data)).astype("float32")
y_data = np.array(map(lambda x: x[1], data)).astype("int64")
y_data = np.expand_dims(y_data, axis=1)
out, acc = exe.run(inference_program, out, acc = exe.run(inference_program,
feed={'x': x_data, feed=feeder.feed(data),
'y': y_data},
fetch_list=[avg_cost] + test_accuracy.metrics) fetch_list=[avg_cost] + test_accuracy.metrics)
test_pass_acc = test_accuracy.eval(exe) test_pass_acc = test_accuracy.eval(exe)
......
...@@ -24,7 +24,7 @@ def get_usr_combined_features(): ...@@ -24,7 +24,7 @@ def get_usr_combined_features():
input=uid, input=uid,
dtype='float32', dtype='float32',
size=[USR_DICT_SIZE, 32], size=[USR_DICT_SIZE, 32],
param_attr={'name': 'user_table'}, param_attr='user_table',
is_sparse=IS_SPARSE) is_sparse=IS_SPARSE)
usr_fc = layers.fc(input=usr_emb, size=32) usr_fc = layers.fc(input=usr_emb, size=32)
...@@ -36,7 +36,7 @@ def get_usr_combined_features(): ...@@ -36,7 +36,7 @@ def get_usr_combined_features():
usr_gender_emb = layers.embedding( usr_gender_emb = layers.embedding(
input=usr_gender_id, input=usr_gender_id,
size=[USR_GENDER_DICT_SIZE, 16], size=[USR_GENDER_DICT_SIZE, 16],
param_attr={'name': 'gender_table'}, param_attr='gender_table',
is_sparse=IS_SPARSE) is_sparse=IS_SPARSE)
usr_gender_fc = layers.fc(input=usr_gender_emb, size=16) usr_gender_fc = layers.fc(input=usr_gender_emb, size=16)
...@@ -48,7 +48,7 @@ def get_usr_combined_features(): ...@@ -48,7 +48,7 @@ def get_usr_combined_features():
input=usr_age_id, input=usr_age_id,
size=[USR_AGE_DICT_SIZE, 16], size=[USR_AGE_DICT_SIZE, 16],
is_sparse=IS_SPARSE, is_sparse=IS_SPARSE,
param_attr={'name': 'age_table'}) param_attr='age_table')
usr_age_fc = layers.fc(input=usr_age_emb, size=16) usr_age_fc = layers.fc(input=usr_age_emb, size=16)
...@@ -58,7 +58,7 @@ def get_usr_combined_features(): ...@@ -58,7 +58,7 @@ def get_usr_combined_features():
usr_job_emb = layers.embedding( usr_job_emb = layers.embedding(
input=usr_job_id, input=usr_job_id,
size=[USR_JOB_DICT_SIZE, 16], size=[USR_JOB_DICT_SIZE, 16],
param_attr={'name': 'job_table'}, param_attr='job_table',
is_sparse=IS_SPARSE) is_sparse=IS_SPARSE)
usr_job_fc = layers.fc(input=usr_job_emb, size=16) usr_job_fc = layers.fc(input=usr_job_emb, size=16)
...@@ -81,7 +81,7 @@ def get_mov_combined_features(): ...@@ -81,7 +81,7 @@ def get_mov_combined_features():
input=mov_id, input=mov_id,
dtype='float32', dtype='float32',
size=[MOV_DICT_SIZE, 32], size=[MOV_DICT_SIZE, 32],
param_attr={'name': 'movie_table'}, param_attr='movie_table',
is_sparse=IS_SPARSE) is_sparse=IS_SPARSE)
mov_fc = layers.fc(input=mov_emb, size=32) mov_fc = layers.fc(input=mov_emb, size=32)
......
...@@ -4,10 +4,8 @@ import paddle.v2 as paddle ...@@ -4,10 +4,8 @@ import paddle.v2 as paddle
import paddle.v2.fluid as fluid import paddle.v2.fluid as fluid
def convolution_net(input_dim, class_dim=2, emb_dim=32, hid_dim=32): def convolution_net(data, label, input_dim, class_dim=2, emb_dim=32,
data = fluid.layers.data(name="words", shape=[1], dtype="int64") hid_dim=32):
label = fluid.layers.data(name="label", shape=[1], dtype="int64")
emb = fluid.layers.embedding(input=data, size=[input_dim, emb_dim]) emb = fluid.layers.embedding(input=data, size=[input_dim, emb_dim])
conv_3 = fluid.nets.sequence_conv_pool( conv_3 = fluid.nets.sequence_conv_pool(
input=emb, input=emb,
...@@ -55,8 +53,11 @@ def main(): ...@@ -55,8 +53,11 @@ def main():
dict_dim = len(word_dict) dict_dim = len(word_dict)
class_dim = 2 class_dim = 2
data = fluid.layers.data(
name="words", shape=[1], dtype="int64", lod_level=1)
label = fluid.layers.data(name="label", shape=[1], dtype="int64")
cost, accuracy, acc_out = convolution_net( cost, accuracy, acc_out = convolution_net(
input_dim=dict_dim, class_dim=class_dim) data, label, input_dim=dict_dim, class_dim=class_dim)
train_data = paddle.batch( train_data = paddle.batch(
paddle.reader.shuffle( paddle.reader.shuffle(
...@@ -64,24 +65,15 @@ def main(): ...@@ -64,24 +65,15 @@ def main():
batch_size=BATCH_SIZE) batch_size=BATCH_SIZE)
place = fluid.CPUPlace() place = fluid.CPUPlace()
exe = fluid.Executor(place) exe = fluid.Executor(place)
feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
exe.run(fluid.default_startup_program()) exe.run(fluid.default_startup_program())
for pass_id in xrange(PASS_NUM): for pass_id in xrange(PASS_NUM):
accuracy.reset(exe) accuracy.reset(exe)
for data in train_data(): for data in train_data():
tensor_words = to_lodtensor(map(lambda x: x[0], data), place) cost_val, acc_val = exe.run(fluid.default_main_program(),
feed=feeder.feed(data),
label = np.array(map(lambda x: x[1], data)).astype("int64")
label = label.reshape([BATCH_SIZE, 1])
tensor_label = fluid.LoDTensor()
tensor_label.set(label, place)
cost_val, acc_val = exe.run(
fluid.default_main_program(),
feed={"words": tensor_words,
"label": tensor_label},
fetch_list=[cost, acc_out]) fetch_list=[cost, acc_out])
pass_acc = accuracy.eval(exe) pass_acc = accuracy.eval(exe)
print("cost=" + str(cost_val) + " acc=" + str(acc_val) + print("cost=" + str(cost_val) + " acc=" + str(acc_val) +
......
...@@ -3,14 +3,14 @@ import paddle.v2 as paddle ...@@ -3,14 +3,14 @@ import paddle.v2 as paddle
import paddle.v2.fluid as fluid import paddle.v2.fluid as fluid
def stacked_lstm_net(input_dim, def stacked_lstm_net(data,
label,
input_dim,
class_dim=2, class_dim=2,
emb_dim=128, emb_dim=128,
hid_dim=512, hid_dim=512,
stacked_num=3): stacked_num=3):
assert stacked_num % 2 == 1 assert stacked_num % 2 == 1
data = fluid.layers.data(name="words", shape=[1], dtype="int64")
label = fluid.layers.data(name="label", shape=[1], dtype="int64")
emb = fluid.layers.embedding(input=data, size=[input_dim, emb_dim]) emb = fluid.layers.embedding(input=data, size=[input_dim, emb_dim])
# add bias attr # add bias attr
...@@ -65,8 +65,11 @@ def main(): ...@@ -65,8 +65,11 @@ def main():
dict_dim = len(word_dict) dict_dim = len(word_dict)
class_dim = 2 class_dim = 2
data = fluid.layers.data(
name="words", shape=[1], dtype="int64", lod_level=1)
label = fluid.layers.data(name="label", shape=[1], dtype="int64")
cost, accuracy, acc_out = stacked_lstm_net( cost, accuracy, acc_out = stacked_lstm_net(
input_dim=dict_dim, class_dim=class_dim) data, label, input_dim=dict_dim, class_dim=class_dim)
train_data = paddle.batch( train_data = paddle.batch(
paddle.reader.shuffle( paddle.reader.shuffle(
...@@ -74,24 +77,15 @@ def main(): ...@@ -74,24 +77,15 @@ def main():
batch_size=BATCH_SIZE) batch_size=BATCH_SIZE)
place = fluid.CPUPlace() place = fluid.CPUPlace()
exe = fluid.Executor(place) exe = fluid.Executor(place)
feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
exe.run(fluid.default_startup_program()) exe.run(fluid.default_startup_program())
for pass_id in xrange(PASS_NUM): for pass_id in xrange(PASS_NUM):
accuracy.reset(exe) accuracy.reset(exe)
for data in train_data(): for data in train_data():
tensor_words = to_lodtensor(map(lambda x: x[0], data), place) cost_val, acc_val = exe.run(fluid.default_main_program(),
feed=feeder.feed(data),
label = np.array(map(lambda x: x[1], data)).astype("int64")
label = label.reshape([BATCH_SIZE, 1])
tensor_label = fluid.LoDTensor()
tensor_label.set(label, place)
cost_val, acc_val = exe.run(
fluid.default_main_program(),
feed={"words": tensor_words,
"label": tensor_label},
fetch_list=[cost, acc_out]) fetch_list=[cost, acc_out])
pass_acc = accuracy.eval(exe) pass_acc = accuracy.eval(exe)
print("cost=" + str(cost_val) + " acc=" + str(acc_val) + print("cost=" + str(cost_val) + " acc=" + str(acc_val) +
......
...@@ -8,7 +8,8 @@ def lstm_net(dict_dim, class_dim=2, emb_dim=32, seq_len=80, batch_size=50): ...@@ -8,7 +8,8 @@ def lstm_net(dict_dim, class_dim=2, emb_dim=32, seq_len=80, batch_size=50):
name="words", name="words",
shape=[seq_len * batch_size, 1], shape=[seq_len * batch_size, 1],
append_batch_size=False, append_batch_size=False,
dtype="int64") dtype="int64",
lod_level=1)
label = fluid.layers.data( label = fluid.layers.data(
name="label", name="label",
shape=[batch_size, 1], shape=[batch_size, 1],
...@@ -21,6 +22,7 @@ def lstm_net(dict_dim, class_dim=2, emb_dim=32, seq_len=80, batch_size=50): ...@@ -21,6 +22,7 @@ def lstm_net(dict_dim, class_dim=2, emb_dim=32, seq_len=80, batch_size=50):
c_pre_init = fluid.layers.fill_constant( c_pre_init = fluid.layers.fill_constant(
dtype=emb.dtype, shape=[batch_size, emb_dim], value=0.0) dtype=emb.dtype, shape=[batch_size, emb_dim], value=0.0)
c_pre_init.stop_gradient = False
layer_1_out = fluid.layers.lstm( layer_1_out = fluid.layers.lstm(
emb, c_pre_init=c_pre_init, hidden_dim=emb_dim) emb, c_pre_init=c_pre_init, hidden_dim=emb_dim)
layer_1_out = fluid.layers.transpose(x=layer_1_out, axis=[1, 0, 2]) layer_1_out = fluid.layers.transpose(x=layer_1_out, axis=[1, 0, 2])
......
...@@ -23,25 +23,25 @@ embed_first = fluid.layers.embedding( ...@@ -23,25 +23,25 @@ embed_first = fluid.layers.embedding(
size=[dict_size, EMBED_SIZE], size=[dict_size, EMBED_SIZE],
dtype='float32', dtype='float32',
is_sparse=IS_SPARSE, is_sparse=IS_SPARSE,
param_attr={'name': 'shared_w'}) param_attr='shared_w')
embed_second = fluid.layers.embedding( embed_second = fluid.layers.embedding(
input=second_word, input=second_word,
size=[dict_size, EMBED_SIZE], size=[dict_size, EMBED_SIZE],
dtype='float32', dtype='float32',
is_sparse=IS_SPARSE, is_sparse=IS_SPARSE,
param_attr={'name': 'shared_w'}) param_attr='shared_w')
embed_third = fluid.layers.embedding( embed_third = fluid.layers.embedding(
input=third_word, input=third_word,
size=[dict_size, EMBED_SIZE], size=[dict_size, EMBED_SIZE],
dtype='float32', dtype='float32',
is_sparse=IS_SPARSE, is_sparse=IS_SPARSE,
param_attr={'name': 'shared_w'}) param_attr='shared_w')
embed_forth = fluid.layers.embedding( embed_forth = fluid.layers.embedding(
input=forth_word, input=forth_word,
size=[dict_size, EMBED_SIZE], size=[dict_size, EMBED_SIZE],
dtype='float32', dtype='float32',
is_sparse=IS_SPARSE, is_sparse=IS_SPARSE,
param_attr={'name': 'shared_w'}) param_attr='shared_w')
concat_embed = fluid.layers.concat( concat_embed = fluid.layers.concat(
input=[embed_first, embed_second, embed_third, embed_forth], axis=1) input=[embed_first, embed_second, embed_third, embed_forth], axis=1)
...@@ -57,28 +57,17 @@ train_reader = paddle.batch( ...@@ -57,28 +57,17 @@ train_reader = paddle.batch(
place = fluid.CPUPlace() place = fluid.CPUPlace()
exe = fluid.Executor(place) exe = fluid.Executor(place)
feeder = fluid.DataFeeder(
# fix https://github.com/PaddlePaddle/Paddle/issues/5434 then remove feed_list=[first_word, second_word, third_word, forth_word, next_word],
# below exit line. place=place)
exit(0)
exe.run(fluid.default_startup_program()) exe.run(fluid.default_startup_program())
for pass_id in range(PASS_NUM): for pass_id in range(PASS_NUM):
for data in train_reader(): for data in train_reader():
input_data = [[data_idx[idx] for data_idx in data] for idx in xrange(5)]
input_data = map(lambda x: np.array(x).astype("int64"), input_data)
input_data = map(lambda x: np.expand_dims(x, axis=1), input_data)
avg_cost_np = exe.run(fluid.default_main_program(), avg_cost_np = exe.run(fluid.default_main_program(),
feed={ feed=feeder.feed(data),
'firstw': input_data[0],
'secondw': input_data[1],
'thirdw': input_data[2],
'forthw': input_data[3],
'nextw': input_data[4]
},
fetch_list=[avg_cost]) fetch_list=[avg_cost])
if avg_cost_np[0] < 10.0: if avg_cost_np[0] < 5.0:
exit(0) # if avg cost less than 10.0, we think our code is good. exit(0) # if avg cost less than 5.0, we think our code is good.
exit(1) exit(1)
import errno
import math
import os
import matplotlib
import numpy
import paddle.v2 as paddle
import paddle.v2.fluid as fluid
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
# Width of the random noise vector fed to the generator (the 'noise' data layer).
NOISE_SIZE = 100
# Number of passes over the MNIST training reader.
NUM_PASS = 1000
# Batch size requested from the reader, i.e. real images per training step.
NUM_REAL_IMGS_IN_BATCH = 121
# How many times dg_program is run after each single run of d_program.
NUM_TRAIN_TIMES_OF_DG = 3
# Learning rate handed to the Adam optimizer.
LEARNING_RATE = 2e-5
def D(x):
    """Discriminator: two stacked fully connected layers producing one logit.

    The first layer has 200 ReLU units; the second has a single output and no
    activation, so the caller receives raw logits. Parameters carry the fixed
    names 'D.w1', 'D.b1', 'D.w2' and 'D.b2' -- presumably so that every call
    of D refers to the same weights; confirm against fluid's naming rules.

    Args:
        x: input variable to classify.

    Returns:
        The logits variable of the second fc layer.
    """
    d_hidden = fluid.layers.fc(
        input=x, size=200, act='relu', param_attr='D.w1', bias_attr='D.b1')
    return fluid.layers.fc(
        input=d_hidden, size=1, act=None, param_attr='D.w2', bias_attr='D.b2')
def G(x):
    """Generator: two stacked fully connected layers producing a flat image.

    The first layer has 200 ReLU units; the second emits 28 * 28 values
    through tanh. Parameters carry the fixed names 'G.w1', 'G.b1', 'G.w2'
    and 'G.b2'.

    Args:
        x: input noise variable.

    Returns:
        The variable holding the generated image, 28 * 28 values per sample.
    """
    g_hidden = fluid.layers.fc(
        input=x, size=200, act='relu', param_attr='G.w1', bias_attr='G.b1')
    return fluid.layers.fc(
        input=g_hidden,
        size=28 * 28,
        act='tanh',
        param_attr='G.w2',
        bias_attr='G.b2')
def plot(gen_data):
    """Draw a batch of generated images on a square grid.

    Args:
        gen_data: numpy array of images; it is viewed as
            (num_images, 28, 28), so every image must hold 784 values.

    Returns:
        The matplotlib figure, with one tick-less, axis-free subplot per
        image. The caller is responsible for saving and closing it.
    """
    # Take a reshaped view rather than calling ndarray.resize(), which would
    # rewrite the shape of the caller's array in place as a hidden side effect.
    images = gen_data.reshape(gen_data.shape[0], 28, 28)

    # Smallest n for which an n x n grid has a cell for every image.
    n = int(math.ceil(math.sqrt(images.shape[0])))
    fig = plt.figure(figsize=(n, n))
    gs = gridspec.GridSpec(n, n)
    gs.update(wspace=0.05, hspace=0.05)

    for i, sample in enumerate(images):
        ax = plt.subplot(gs[i])
        plt.axis('off')
        ax.set_xticklabels([])
        ax.set_yticklabels([])
        ax.set_aspect('equal')
        # Each sample is already 28 x 28 after the reshape above.
        plt.imshow(sample, cmap='Greys_r')

    return fig
def main():
    """Train a small fully connected GAN on MNIST and dump sample images.

    Three programs share one startup program:
      * d_program  -- discriminator loss on a fed batch of images and labels;
      * dg_program -- generator followed by discriminator, scored against an
                      all-ones label;
      * g_program  -- a clone of dg_program taken right after the generator
                      was built, used to sample images.

    For every batch the discriminator is run once on real plus generated
    images, then dg_program is run NUM_TRAIN_TIMES_OF_DG times. The latest
    generated batch is plotted to ./out/<pass_id>.png.
    """
    # Create the output directory; an already existing one is not an error.
    try:
        os.makedirs("./out")
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    startup_program = fluid.Program()
    d_program = fluid.Program()
    dg_program = fluid.Program()

    with fluid.program_guard(d_program, startup_program):
        img = fluid.layers.data(name='img', shape=[784], dtype='float32')
        d_loss = fluid.layers.sigmoid_cross_entropy_with_logits(
            x=D(img),
            label=fluid.layers.data(
                name='label', shape=[1], dtype='float32'))
        d_loss = fluid.layers.mean(x=d_loss)

    with fluid.program_guard(dg_program, startup_program):
        noise = fluid.layers.data(
            name='noise', shape=[NOISE_SIZE], dtype='float32')
        g_img = G(x=noise)
        # Cloned before the discriminator ops are appended, so g_program
        # presumably contains only the generator -- confirm against
        # Program.clone semantics.
        g_program = dg_program.clone()
        dg_loss = fluid.layers.sigmoid_cross_entropy_with_logits(
            x=D(g_img),
            label=fluid.layers.fill_constant_batch_size_like(
                input=noise, dtype='float32', shape=[-1, 1], value=1.0))
        dg_loss = fluid.layers.mean(x=dg_loss)

    opt = fluid.optimizer.Adam(learning_rate=LEARNING_RATE)

    opt.minimize(loss=d_loss, startup_program=startup_program)
    # Restrict the DG step to the parameters found in g_program.
    opt.minimize(
        loss=dg_loss,
        startup_program=startup_program,
        parameter_list=[
            p.name for p in g_program.global_block().all_parameters()
        ])

    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(startup_program)

    num_true = NUM_REAL_IMGS_IN_BATCH
    train_reader = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.mnist.train(), buf_size=60000),
        batch_size=num_true)

    for pass_id in range(NUM_PASS):
        for batch_id, data in enumerate(train_reader()):
            # The last batch of a pass may be smaller than the requested size.
            num_true = len(data)
            n = numpy.random.uniform(
                low=-1.0, high=1.0,
                size=[num_true * NOISE_SIZE]).astype('float32').reshape(
                    [num_true, NOISE_SIZE])
            # fetch_list is passed as a list (not a set literal), matching
            # the parameter's name and the other call sites in this code base.
            generated_img = exe.run(g_program,
                                    feed={'noise': n},
                                    fetch_list=[g_img])[0]

            # A list comprehension instead of map() so that numpy.array
            # receives a real sequence on both Python 2 and Python 3.
            real_data = numpy.array(
                [sample[0] for sample in data]).astype('float32')
            real_data = real_data.reshape(num_true, 784)

            # Real images are labelled 1, generated images 0.
            total_data = numpy.concatenate([real_data, generated_img])
            total_label = numpy.concatenate([
                numpy.ones(
                    shape=[real_data.shape[0], 1], dtype='float32'),
                numpy.zeros(
                    shape=[generated_img.shape[0], 1], dtype='float32')
            ])
            d_loss_np = exe.run(d_program,
                                feed={'img': total_data,
                                      'label': total_label},
                                fetch_list=[d_loss])[0]

            for _ in range(NUM_TRAIN_TIMES_OF_DG):
                n = numpy.random.uniform(
                    low=-1.0, high=1.0,
                    size=[2 * num_true * NOISE_SIZE]).astype(
                        'float32').reshape(
                            [2 * num_true, NOISE_SIZE, 1, 1])
                dg_loss_np = exe.run(dg_program,
                                     feed={'noise': n},
                                     fetch_list=[dg_loss])[0]

            print("Pass ID={0}, Batch ID={1}, D-Loss={2}, DG-Loss={3}".format(
                pass_id, batch_id, d_loss_np, dg_loss_np))

            # Plot after every batch. The file name depends only on pass_id,
            # so each batch overwrites the previous image of the same pass.
            fig = plot(generated_img)
            plt.savefig(
                'out/{0}.png'.format(str(pass_id).zfill(3)),
                bbox_inches='tight')
            plt.close(fig)


if __name__ == '__main__':
    main()
...@@ -3,7 +3,7 @@ import paddle.v2.fluid.core as core ...@@ -3,7 +3,7 @@ import paddle.v2.fluid.core as core
import paddle.v2.fluid.layers as layers import paddle.v2.fluid.layers as layers
from paddle.v2.fluid.executor import Executor from paddle.v2.fluid.executor import Executor
from paddle.v2.fluid.backward import append_backward_ops from paddle.v2.fluid.backward import append_backward_ops
from paddle.v2.fluid.framework import g_main_program from paddle.v2.fluid.framework import default_main_program
import numpy import numpy
...@@ -66,7 +66,7 @@ class TestArrayReadWrite(unittest.TestCase): ...@@ -66,7 +66,7 @@ class TestArrayReadWrite(unittest.TestCase):
append_backward_ops(total_sum_scaled) append_backward_ops(total_sum_scaled)
g_vars = map(g_main_program.global_block().var, g_vars = map(default_main_program().global_block().var,
[each_x.name + "@GRAD" for each_x in x]) [each_x.name + "@GRAD" for each_x in x])
g_out = [ g_out = [
item.sum() item.sum()
......
...@@ -21,6 +21,13 @@ def get_backward_op(scope, op, no_grad_set): ...@@ -21,6 +21,13 @@ def get_backward_op(scope, op, no_grad_set):
def _reference_training(x, scale, offset, epsilon, data_format): def _reference_training(x, scale, offset, epsilon, data_format):
x_shape = x.shape
if len(x_shape) == 2:
if data_format == "NCHW":
x = np.reshape(x, (x.shape[0], x.shape[1], 1, 1))
else:
x = np.reshape(x, (x.shape[0], 1, 1, x.shape[1]))
if data_format == "NCHW": if data_format == "NCHW":
n, c, h, w = x.shape n, c, h, w = x.shape
x_square = x * x x_square = x * x
...@@ -39,6 +46,8 @@ def _reference_training(x, scale, offset, epsilon, data_format): ...@@ -39,6 +46,8 @@ def _reference_training(x, scale, offset, epsilon, data_format):
offset_tile = np.reshape(offset, (1, c, 1, 1)) offset_tile = np.reshape(offset, (1, c, 1, 1))
offset_tile = np.reshape(offset_tile, (1, c, 1, 1)) offset_tile = np.reshape(offset_tile, (1, c, 1, 1))
y = normalized * scale_tile + offset_tile y = normalized * scale_tile + offset_tile
if len(x_shape) == 2:
y = np.reshape(y, (y.shape[0], y.shape[1]))
return y, mean, var return y, mean, var
elif data_format == "NHWC": elif data_format == "NHWC":
x_square = x * x x_square = x * x
...@@ -48,7 +57,10 @@ def _reference_training(x, scale, offset, epsilon, data_format): ...@@ -48,7 +57,10 @@ def _reference_training(x, scale, offset, epsilon, data_format):
mean = x_sum / element_count mean = x_sum / element_count
var = x_square_sum / element_count - mean * mean var = x_square_sum / element_count - mean * mean
normalized = (x - mean) / np.sqrt(var + epsilon) normalized = (x - mean) / np.sqrt(var + epsilon)
return (normalized * scale + offset), mean, var y = normalized * scale + offset
if len(x_shape) == 2:
y = np.reshape(y, x_shape)
return y, mean, var
else: else:
raise ValueError("Unknown data order.") raise ValueError("Unknown data order.")
...@@ -65,6 +77,18 @@ def _reference_grad(x, grad_y, scale, mean, var, epsilon, data_format): ...@@ -65,6 +77,18 @@ def _reference_grad(x, grad_y, scale, mean, var, epsilon, data_format):
# (x - mean) * sum(grad_y * (x - mean)) / (var + epsilon)) # (x - mean) * sum(grad_y * (x - mean)) / (var + epsilon))
# transfer from (N, C, H, W) to (N, H, W, C) to simplify computation # transfer from (N, C, H, W) to (N, H, W, C) to simplify computation
x_shape = x.shape
if len(x_shape) == 2:
if data_format == "NCHW":
x = np.reshape(x, (x.shape[0], x.shape[1], 1, 1))
grad_y = np.reshape(grad_y,
(grad_y.shape[0], grad_y.shape[1], 1, 1))
else:
x = np.reshape(x, (x.shape[0], 1, 1, x.shape[1]))
grad_y = np.reshape(grad_y,
(grad_y.shape[0], 1, 1, grad_y.shape[1]))
if data_format == "NCHW": if data_format == "NCHW":
x = np.transpose(x, (0, 2, 3, 1)) x = np.transpose(x, (0, 2, 3, 1))
grad_y = np.transpose(grad_y, (0, 2, 3, 1)) grad_y = np.transpose(grad_y, (0, 2, 3, 1))
...@@ -83,6 +107,9 @@ def _reference_grad(x, grad_y, scale, mean, var, epsilon, data_format): ...@@ -83,6 +107,9 @@ def _reference_grad(x, grad_y, scale, mean, var, epsilon, data_format):
grad_x = np.transpose(grad_x, (0, 3, 1, 2)) grad_x = np.transpose(grad_x, (0, 3, 1, 2))
x = np.transpose(x, (0, 3, 1, 2)) x = np.transpose(x, (0, 3, 1, 2))
grad_y = np.transpose(grad_y, (0, 3, 1, 2)) grad_y = np.transpose(grad_y, (0, 3, 1, 2))
if len(x_shape) == 2:
grad_x = np.reshape(grad_x, x_shape)
return grad_x, grad_scale, grad_offset return grad_x, grad_scale, grad_offset
...@@ -127,7 +154,7 @@ class TestBatchNormOp(OpTest): ...@@ -127,7 +154,7 @@ class TestBatchNormOp(OpTest):
momentum = 0.9 momentum = 0.9
# N, H, W, C: 2, 3, 4, 2 # N, H, W, C: 2, 3, 4, 5
n, h, w, c = 2, 3, 4, 2 n, h, w, c = 2, 3, 4, 5
x_shape = [n, h, w, c] x_shape = [n, h, w, c]
scale_shape = [c] scale_shape = [c]
...@@ -184,14 +211,17 @@ class TestBatchNormOp(OpTest): ...@@ -184,14 +211,17 @@ class TestBatchNormOp(OpTest):
print 'python: NHWC, NCHW, backward checking passed' print 'python: NHWC, NCHW, backward checking passed'
def test_forward_backward(self): def test_forward_backward(self):
def test_with_place(place, tensor_format): def test_with_place(place, tensor_format, shape):
# attr # attr
epsilon = 0.00001 epsilon = 0.00001
momentum = 0.9 momentum = 0.9
# N, H, W, C: 12, 3, 4, 2 if len(shape) == 2:
n, h, w, c = 2, 3, 4, 2 x_shape = shape
c = shape[1]
else:
# n, h, w, c = 2, 3, 4, 2
n, h, w, c = shape[0], shape[1], shape[2], shape[3]
if data_format == "NHWC": if data_format == "NHWC":
x_shape = [n, h, w, c] x_shape = [n, h, w, c]
elif data_format == "NCHW": elif data_format == "NCHW":
...@@ -219,6 +249,9 @@ class TestBatchNormOp(OpTest): ...@@ -219,6 +249,9 @@ class TestBatchNormOp(OpTest):
# for gradient test # for gradient test
# y_grad = np.ones(x_shape).astype(np.float32) # y_grad = np.ones(x_shape).astype(np.float32)
y_grad = np.zeros(x_shape).astype(np.float32) y_grad = np.zeros(x_shape).astype(np.float32)
if len(y_grad.shape) == 2:
y_grad[0, 0] = 1.
else:
y_grad[0, 0, 0, 0] = 1. y_grad[0, 0, 0, 0] = 1.
# y_grad = np.random.random_sample(x_shape).astype(np.float32) # y_grad = np.random.random_sample(x_shape).astype(np.float32)
x_grad_ref, scale_grad_ref, bias_grad_ref = _reference_grad( x_grad_ref, scale_grad_ref, bias_grad_ref = _reference_grad(
...@@ -313,7 +346,8 @@ class TestBatchNormOp(OpTest): ...@@ -313,7 +346,8 @@ class TestBatchNormOp(OpTest):
places.append(core.GPUPlace(0)) places.append(core.GPUPlace(0))
for place in places: for place in places:
for data_format in ["NCHW", "NHWC"]: for data_format in ["NCHW", "NHWC"]:
test_with_place(place, data_format) test_with_place(place, data_format, [2, 3, 4, 5])
test_with_place(place, data_format, [2, 3])
if __name__ == '__main__': if __name__ == '__main__':
......
import unittest import unittest
import paddle.v2.fluid.layers as layers import paddle.v2.fluid.layers as layers
import paddle.v2.fluid.core as core import paddle.v2.fluid.core as core
from paddle.v2.fluid.framework import g_startup_program, g_main_program from paddle.v2.fluid.framework import default_startup_program, default_main_program
from paddle.v2.fluid.executor import Executor from paddle.v2.fluid.executor import Executor
from paddle.v2.fluid.backward import append_backward_ops from paddle.v2.fluid.backward import append_backward_ops
import numpy import numpy
...@@ -19,7 +19,7 @@ class ConditionalBlock(unittest.TestCase): ...@@ -19,7 +19,7 @@ class ConditionalBlock(unittest.TestCase):
cpu = core.CPUPlace() cpu = core.CPUPlace()
exe = Executor(cpu) exe = Executor(cpu)
exe.run(g_startup_program) exe.run(default_startup_program())
x = numpy.random.random(size=(10, 1)).astype('float32') x = numpy.random.random(size=(10, 1)).astype('float32')
...@@ -29,7 +29,9 @@ class ConditionalBlock(unittest.TestCase): ...@@ -29,7 +29,9 @@ class ConditionalBlock(unittest.TestCase):
append_backward_ops(loss=loss) append_backward_ops(loss=loss)
outs = exe.run( outs = exe.run(
feed={'X': x}, feed={'X': x},
fetch_list=[g_main_program.block(0).var(data.name + "@GRAD")])[0] fetch_list=[
default_main_program().block(0).var(data.name + "@GRAD")
])[0]
print outs print outs
......
import paddle.v2.fluid as fluid
def test_converter():
    """Convert a two-sample mini-batch with DataFeeder and print the result.

    Each sample pairs a flat list of 784 values (for the 1x28x28 'image'
    layer) with a one-element label list. Nothing is asserted; the converted
    feed is only printed.
    """
    image_var = fluid.layers.data(name='image', shape=[1, 28, 28])
    label_var = fluid.layers.data(name='label', shape=[1], dtype='int64')
    converter = fluid.DataFeeder([image_var, label_var], fluid.CPUPlace())
    mini_batch = [[[0] * 784, [9]], [[1] * 784, [1]]]
    converted = converter.feed(mini_batch)
    print(converted)


if __name__ == '__main__':
    test_converter()
import paddle.v2.fluid as fluid
import paddle.v2 as paddle
import unittest
import numpy
class TestDynRNN(unittest.TestCase):
    """Builds a recurrent net by hand with a While op and runs one batch."""

    def setUp(self):
        # IMDB vocabulary and a training reader batched 100 samples at a time.
        self.word_dict = paddle.dataset.imdb.word_dict()
        self.BATCH_SIZE = 100
        self.train_data = paddle.batch(
            paddle.dataset.imdb.train(self.word_dict),
            batch_size=self.BATCH_SIZE)

    def test_plain_while_op(self):
        """Run one IMDB batch through a While-based RNN and check the loss.

        The loss fetched after a single executor run must have shape (1,)
        and must not be NaN.
        """
        main_program = fluid.Program()
        startup_program = fluid.Program()

        with fluid.program_guard(main_program, startup_program):
            # Variable-length int64 word ids (lod_level=1) and their embedding.
            sentence = fluid.layers.data(
                name='word', shape=[1], dtype='int64', lod_level=1)
            sent_emb = fluid.layers.embedding(
                input=sentence, size=[len(self.word_dict), 32], dtype='float32')

            label = fluid.layers.data(name='label', shape=[1], dtype='float32')

            # Split the embedded batch into an array indexed by `i` below.
            # NOTE(review): presumably one entry per time step, ordered by the
            # rank table -- confirm against lod_tensor_to_array.
            rank_table = fluid.layers.lod_rank_table(x=sent_emb)

            sent_emb_array = fluid.layers.lod_tensor_to_array(
                x=sent_emb, table=rank_table)

            # Loop bound and loop counter.
            seq_len = fluid.layers.max_sequence_len(rank_table=rank_table)
            i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0)
            i.stop_gradient = False

            # Zero-filled initial memory with 100 columns, batch-sized like
            # the first array entry; written to slot i of a new array.
            boot_mem = fluid.layers.fill_constant_batch_size_like(
                input=fluid.layers.array_read(
                    array=sent_emb_array, i=i),
                value=0,
                shape=[-1, 100],
                dtype='float32')
            boot_mem.stop_gradient = False

            mem_array = fluid.layers.array_write(x=boot_mem, i=i)

            # Loop condition: i < seq_len.
            cond = fluid.layers.less_than(x=i, y=seq_len)
            cond.stop_gradient = False
            while_op = fluid.layers.While(cond=cond)
            out = fluid.layers.create_array(dtype='float32')

            with while_op.block():
                mem = fluid.layers.array_read(array=mem_array, i=i)
                ipt = fluid.layers.array_read(array=sent_emb_array, i=i)

                # NOTE(review): presumably trims the memory to the sequences
                # still active at step i -- confirm against shrink_memory.
                mem = fluid.layers.shrink_memory(x=mem, i=i, table=rank_table)

                hidden = fluid.layers.fc(input=[mem, ipt], size=100, act='tanh')
                # Store the hidden state at index i of `out`, advance i in
                # place, then store the same state at the new index of
                # mem_array for the next iteration to read.
                fluid.layers.array_write(x=hidden, i=i, array=out)
                fluid.layers.increment(x=i, in_place=True)
                fluid.layers.array_write(x=hidden, i=i, array=mem_array)
                # Re-evaluate the loop condition into the existing `cond`.
                fluid.layers.less_than(x=i, y=seq_len, cond=cond)

            # Merge the per-step outputs, pool each sequence with 'last',
            # and classify with a single-logit fc layer.
            all_timesteps = fluid.layers.array_to_lod_tensor(
                x=out, table=rank_table)
            last = fluid.layers.sequence_pool(
                input=all_timesteps, pool_type='last')
            logits = fluid.layers.fc(input=last, size=1, act=None)
            loss = fluid.layers.sigmoid_cross_entropy_with_logits(
                x=logits, label=label)
            loss = fluid.layers.mean(x=loss)
            sgd = fluid.optimizer.SGD(1e-4)
            sgd.minimize(loss=loss)

        cpu = fluid.CPUPlace()
        exe = fluid.Executor(cpu)
        exe.run(startup_program)
        feeder = fluid.DataFeeder(feed_list=[sentence, label], place=cpu)

        # Only the first batch of the reader is used.
        data = next(self.train_data())
        val = exe.run(main_program, feed=feeder.feed(data),
                      fetch_list=[loss])[0]
        self.assertEqual((1, ), val.shape)
        print(val)
        self.assertFalse(numpy.isnan(val))


if __name__ == '__main__':
    unittest.main()
import unittest import unittest
from paddle.v2.fluid.layers import mul, data, sequence_pool
import numpy
import paddle.v2.fluid.core as core import paddle.v2.fluid.core as core
from paddle.v2.fluid.executor import Executor from paddle.v2.fluid.executor import Executor
from paddle.v2.fluid.framework import g_main_program from paddle.v2.fluid.layers import mul, data
import numpy
class TestExecutor(unittest.TestCase): class TestExecutor(unittest.TestCase):
...@@ -19,10 +20,7 @@ class TestExecutor(unittest.TestCase): ...@@ -19,10 +20,7 @@ class TestExecutor(unittest.TestCase):
a_np = numpy.random.random((100, 784)).astype('float32') a_np = numpy.random.random((100, 784)).astype('float32')
b_np = numpy.random.random((784, 100)).astype('float32') b_np = numpy.random.random((784, 100)).astype('float32')
exe = Executor(place) exe = Executor(place)
outs = exe.run(g_main_program, outs = exe.run(feed={'a': a_np, 'b': b_np}, fetch_list=[out])
feed={'a': a_np,
'b': b_np},
fetch_list=[out])
out = outs[0] out = outs[0]
self.assertEqual((100, 100), out.shape) self.assertEqual((100, 100), out.shape)
self.assertTrue(numpy.allclose(out, numpy.dot(a_np, b_np))) self.assertTrue(numpy.allclose(out, numpy.dot(a_np, b_np)))
......
import unittest
import numpy as np
from op_test import OpTest
class TestHingeLossOp(OpTest):
    """Checks the 'hinge_loss' op against a numpy reference.

    Reference: loss = max(1 - (2 * label - 1) * logit, 0), with labels drawn
    from {0, 1} and logits drawn uniformly from [-10, 10).
    """

    def setUp(self):
        self.op_type = 'hinge_loss'
        batch = 64
        shape = (batch, 1)
        scores = np.random.uniform(-10, 10, shape).astype('float32')
        targets = np.random.randint(0, 2, shape).astype('float32')

        self.inputs = {
            'Logits': scores,
            'Labels': targets,
        }
        # Map {0, 1} labels to {-1, +1} before computing the margin.
        signed_targets = 2 * targets - 1
        margin = 1.0 - signed_targets * scores
        self.outputs = {'Loss': np.maximum(margin, 0)}

    def test_check_output(self):
        self.check_output()

    def test_check_grad(self):
        # Gradient is checked with respect to 'Logits' only.
        self.check_grad(['Logits'], 'Loss', max_relative_error=0.008)


if __name__ == '__main__':
    unittest.main()
import unittest import unittest
import paddle.v2.fluid.layers as layers import paddle.v2.fluid as fluid
import paddle.v2.fluid.nets as nets import paddle.v2.fluid.nets as nets
from paddle.v2.fluid.framework import Program from paddle.v2.fluid.framework import Program
...@@ -29,27 +29,35 @@ class TestLayer(unittest.TestCase): ...@@ -29,27 +29,35 @@ class TestLayer(unittest.TestCase):
def test_batch_norm_layer(self): def test_batch_norm_layer(self):
main_program = Program() main_program = Program()
startup_program = Program() startup_program = Program()
images = layers.data( images = fluid.layers.data(
name='pixel', name='pixel',
shape=[3, 48, 48], shape=[3, 48, 48],
dtype='float32', dtype='float32',
main_program=main_program) main_program=main_program)
layers.batch_norm( hidden1 = fluid.layers.batch_norm(
input=images, input=images,
main_program=main_program, main_program=main_program,
startup_program=startup_program) startup_program=startup_program)
hidden2 = fluid.layers.fc(input=hidden1,
size=128,
act='relu',
main_program=main_program)
hidden3 = fluid.layers.batch_norm(
input=hidden2,
main_program=main_program,
startup_program=startup_program)
# print str(main_program) print str(main_program)
def test_dropout_layer(self): def test_dropout_layer(self):
main_program = Program() main_program = Program()
startup_program = Program() startup_program = Program()
images = layers.data( images = fluid.layers.data(
name='pixel', name='pixel',
shape=[3, 48, 48], shape=[3, 48, 48],
dtype='float32', dtype='float32',
main_program=main_program) main_program=main_program)
layers.dropout( fluid.layers.dropout(
x=images, x=images,
dropout_prob=0.5, dropout_prob=0.5,
main_program=main_program, main_program=main_program,
...@@ -61,7 +69,7 @@ class TestLayer(unittest.TestCase): ...@@ -61,7 +69,7 @@ class TestLayer(unittest.TestCase):
main_program = Program() main_program = Program()
startup_program = Program() startup_program = Program()
images = layers.data( images = fluid.layers.data(
name='pixel', name='pixel',
shape=[3, 48, 48], shape=[3, 48, 48],
dtype='float32', dtype='float32',
...@@ -77,19 +85,19 @@ class TestLayer(unittest.TestCase): ...@@ -77,19 +85,19 @@ class TestLayer(unittest.TestCase):
def test_elementwise_add_with_act(self): def test_elementwise_add_with_act(self):
main_program = Program() main_program = Program()
startup_program = Program() startup_program = Program()
image1 = layers.data( image1 = fluid.layers.data(
name='pixel1', name='pixel1',
shape=[3, 48, 48], shape=[3, 48, 48],
dtype='float32', dtype='float32',
main_program=main_program, main_program=main_program,
startup_program=startup_program) startup_program=startup_program)
image2 = layers.data( image2 = fluid.layers.data(
name='pixel2', name='pixel2',
shape=[3, 48, 48], shape=[3, 48, 48],
dtype='float32', dtype='float32',
main_program=main_program, main_program=main_program,
startup_program=startup_program) startup_program=startup_program)
out = layers.elementwise_add( out = fluid.layers.elementwise_add(
x=image1, x=image1,
y=image2, y=image2,
act='relu', act='relu',
......
from __future__ import print_function
import unittest import unittest
import paddle.v2.fluid.layers as layers import paddle.v2.fluid.layers as layers
import paddle.v2.fluid.nets as nets import paddle.v2.fluid.nets as nets
from paddle.v2.fluid.framework import Program from paddle.v2.fluid.framework import Program, program_guard
class TestBook(unittest.TestCase): class TestBook(unittest.TestCase):
def test_fit_a_line(self): def test_fit_a_line(self):
program = Program() program = Program()
x = layers.data( with program_guard(program, startup_program=Program()):
name='x', shape=[13], dtype='float32', main_program=program) x = layers.data(name='x', shape=[13], dtype='float32')
y_predict = layers.fc(input=x, size=1, act=None, main_program=program) y_predict = layers.fc(input=x, size=1, act=None)
y = layers.data(name='y', shape=[1], dtype='float32')
y = layers.data( cost = layers.square_error_cost(input=y_predict, label=y)
name='y', shape=[1], dtype='float32', main_program=program) avg_cost = layers.mean(x=cost)
cost = layers.square_error_cost(
input=y_predict, label=y, main_program=program)
avg_cost = layers.mean(x=cost, main_program=program)
self.assertIsNotNone(avg_cost) self.assertIsNotNone(avg_cost)
program.append_backward(avg_cost) program.append_backward(avg_cost)
print str(program) print(str(program))
def test_recognize_digits_mlp(self): def test_recognize_digits_mlp(self):
program = Program() program = Program()
with program_guard(program, startup_program=Program()):
# Change g_program, so the rest layers use `g_program` # Change g_program, so the rest layers use `g_program`
images = layers.data( images = layers.data(name='pixel', shape=[784], dtype='float32')
name='pixel', shape=[784], dtype='float32', main_program=program) label = layers.data(name='label', shape=[1], dtype='int32')
label = layers.data( hidden1 = layers.fc(input=images, size=128, act='relu')
name='label', shape=[1], dtype='int32', main_program=program) hidden2 = layers.fc(input=hidden1, size=64, act='relu')
hidden1 = layers.fc(input=images, predict = layers.fc(input=hidden2, size=10, act='softmax')
size=128, cost = layers.cross_entropy(input=predict, label=label)
act='relu', avg_cost = layers.mean(x=cost)
main_program=program)
hidden2 = layers.fc(input=hidden1,
size=64,
act='relu',
main_program=program)
predict = layers.fc(input=hidden2,
size=10,
act='softmax',
main_program=program)
cost = layers.cross_entropy(
input=predict, label=label, main_program=program)
avg_cost = layers.mean(x=cost, main_program=program)
self.assertIsNotNone(avg_cost) self.assertIsNotNone(avg_cost)
print str(program) print(str(program))
def test_simple_conv2d(self): def test_simple_conv2d(self):
program = Program() program = Program()
images = layers.data( with program_guard(program, startup_program=Program()):
name='pixel', images = layers.data(name='pixel', shape=[3, 48, 48], dtype='int32')
shape=[3, 48, 48], layers.conv2d(input=images, num_filters=3, filter_size=[4, 4])
dtype='int32',
main_program=program)
layers.conv2d(
input=images,
num_filters=3,
filter_size=[4, 4],
main_program=program)
print str(program) print(str(program))
def test_recognize_digits_conv(self): def test_conv2d_transpose(self):
program = Program() program = Program()
with program_guard(program):
img = layers.data(name='pixel', shape=[3, 2, 2], dtype='float32')
layers.conv2d_transpose(input=img, num_filters=10, output_size=28)
print(str(program))
def test_recognize_digits_conv(self):
program = Program()
with program_guard(program, startup_program=Program()):
images = layers.data( images = layers.data(
name='pixel', name='pixel', shape=[1, 28, 28], dtype='float32')
shape=[1, 28, 28], label = layers.data(name='label', shape=[1], dtype='int32')
dtype='float32',
main_program=program)
label = layers.data(
name='label', shape=[1], dtype='int32', main_program=program)
conv_pool_1 = nets.simple_img_conv_pool( conv_pool_1 = nets.simple_img_conv_pool(
input=images, input=images,
filter_size=5, filter_size=5,
num_filters=2, num_filters=2,
pool_size=2, pool_size=2,
pool_stride=2, pool_stride=2,
act="relu", act="relu")
main_program=program)
conv_pool_2 = nets.simple_img_conv_pool( conv_pool_2 = nets.simple_img_conv_pool(
input=conv_pool_1, input=conv_pool_1,
filter_size=5, filter_size=5,
num_filters=4, num_filters=4,
pool_size=2, pool_size=2,
pool_stride=2, pool_stride=2,
act="relu", act="relu")
main_program=program)
predict = layers.fc(input=conv_pool_2, predict = layers.fc(input=conv_pool_2, size=10, act="softmax")
size=10, cost = layers.cross_entropy(input=predict, label=label)
act="softmax", avg_cost = layers.mean(x=cost)
main_program=program)
cost = layers.cross_entropy(
input=predict, label=label, main_program=program)
avg_cost = layers.mean(x=cost, main_program=program)
program.append_backward(avg_cost) program.append_backward(avg_cost)
print str(program) print(str(program))
def test_word_embedding(self): def test_word_embedding(self):
program = Program() program = Program()
with program_guard(program, startup_program=Program()):
dict_size = 10000 dict_size = 10000
embed_size = 32 embed_size = 32
first_word = layers.data( first_word = layers.data(name='firstw', shape=[1], dtype='int64')
name='firstw', shape=[1], dtype='int64', main_program=program) second_word = layers.data(name='secondw', shape=[1], dtype='int64')
second_word = layers.data( third_word = layers.data(name='thirdw', shape=[1], dtype='int64')
name='secondw', shape=[1], dtype='int64', main_program=program) forth_word = layers.data(name='forthw', shape=[1], dtype='int64')
third_word = layers.data( next_word = layers.data(name='nextw', shape=[1], dtype='int64')
name='thirdw', shape=[1], dtype='int64', main_program=program)
forth_word = layers.data(
name='forthw', shape=[1], dtype='int64', main_program=program)
next_word = layers.data(
name='nextw', shape=[1], dtype='int64', main_program=program)
embed_first = layers.embedding( embed_first = layers.embedding(
input=first_word, input=first_word,
size=[dict_size, embed_size], size=[dict_size, embed_size],
dtype='float32', dtype='float32',
param_attr={'name': 'shared_w'}, param_attr='shared_w')
main_program=program)
embed_second = layers.embedding( embed_second = layers.embedding(
input=second_word, input=second_word,
size=[dict_size, embed_size], size=[dict_size, embed_size],
dtype='float32', dtype='float32',
param_attr={'name': 'shared_w'}, param_attr='shared_w')
main_program=program)
embed_third = layers.embedding( embed_third = layers.embedding(
input=third_word, input=third_word,
size=[dict_size, embed_size], size=[dict_size, embed_size],
dtype='float32', dtype='float32',
param_attr={'name': 'shared_w'}, param_attr='shared_w')
main_program=program)
embed_forth = layers.embedding( embed_forth = layers.embedding(
input=forth_word, input=forth_word,
size=[dict_size, embed_size], size=[dict_size, embed_size],
dtype='float32', dtype='float32',
param_attr={'name': 'shared_w'}, param_attr='shared_w')
main_program=program)
concat_embed = layers.concat( concat_embed = layers.concat(
input=[embed_first, embed_second, embed_third, embed_forth], input=[embed_first, embed_second, embed_third, embed_forth],
axis=1, axis=1)
main_program=program)
hidden1 = layers.fc(input=concat_embed, hidden1 = layers.fc(input=concat_embed, size=256, act='sigmoid')
size=256,
act='sigmoid',
main_program=program)
predict_word = layers.fc(input=hidden1, predict_word = layers.fc(input=hidden1,
size=dict_size, size=dict_size,
act='softmax', act='softmax')
main_program=program) cost = layers.cross_entropy(input=predict_word, label=next_word)
cost = layers.cross_entropy( avg_cost = layers.mean(x=cost)
input=predict_word, label=next_word, main_program=program)
avg_cost = layers.mean(x=cost, main_program=program)
self.assertIsNotNone(avg_cost) self.assertIsNotNone(avg_cost)
print str(program) print(str(program))
def test_linear_chain_crf(self): def test_linear_chain_crf(self):
program = Program() program = Program()
with program_guard(program, startup_program=Program()):
images = layers.data(name='pixel', shape=[784], dtype='float32')
label = layers.data(name='label', shape=[1], dtype='int32')
hidden = layers.fc(input=images, size=128)
crf = layers.linear_chain_crf(input=hidden, label=label)
self.assertNotEqual(crf, None)
# Change g_program, so the rest layers use `g_program` print(str(program))
images = layers.data(
name='pixel', shape=[784], dtype='float32', main_program=program) def test_sigmoid_cross_entropy(self):
label = layers.data( program = Program()
name='label', shape=[1], dtype='int32', main_program=program) with program_guard(program):
hidden = layers.fc(input=images, size=128, main_program=program) dat = layers.data(name='data', shape=[10], dtype='float32')
crf = layers.linear_chain_crf( lbl = layers.data(name='label', shape=[10], dtype='float32')
input=hidden, label=label, main_program=program) self.assertIsNotNone(
layers.sigmoid_cross_entropy_with_logits(
print str(program) x=dat, label=lbl))
print(str(program))
if __name__ == '__main__': if __name__ == '__main__':
......
from paddle.v2.fluid.layers import lod_rank_table, data from paddle.v2.fluid.layers import lod_rank_table, data
from paddle.v2.fluid.executor import Executor from paddle.v2.fluid.executor import Executor
from paddle.v2.fluid.framework import g_main_program
import paddle.v2.fluid.core as core import paddle.v2.fluid.core as core
import numpy import numpy
import unittest import unittest
...@@ -18,7 +17,7 @@ class TestLoDRankTable(unittest.TestCase): ...@@ -18,7 +17,7 @@ class TestLoDRankTable(unittest.TestCase):
tensor = core.LoDTensor() tensor = core.LoDTensor()
tensor.set(numpy.random.random(size=(17, 100)), cpu) tensor.set(numpy.random.random(size=(17, 100)), cpu)
tensor.set_lod([[0, 1, 3], [0, 5, 6, 7], [0, 3, 4, 9, 10, 13, 16, 17]]) tensor.set_lod([[0, 1, 3], [0, 5, 6, 7], [0, 3, 4, 9, 10, 13, 16, 17]])
exe.run(g_main_program, scope=scope, feed={'x': tensor}) exe.run(scope=scope, feed={'x': tensor})
var = scope.find_var(rank_table.name) var = scope.find_var(rank_table.name)
table = var.get_lod_rank_table() table = var.get_lod_rank_table()
self.assertEqual([(0, 5), (1, 1), (2, 1)], table.items()) self.assertEqual([(0, 5), (1, 1), (2, 1)], table.items())
......
import unittest
import numpy as np
from op_test import OpTest
class TestLogLossOp(OpTest):
    """Operator test case for the 'log_loss' op.

    setUp draws random predictions and binary labels and computes the
    expected loss with numpy:
        -label * log(p + eps) - (1 - label) * log(1 - p + eps)
    """

    def setUp(self):
        self.op_type = 'log_loss'
        batch = 32
        eps = 1e-4

        # Predictions are drawn from [0.1, 1.0); labels are 0.0 or 1.0.
        # The two random draws stay in this order so seeded runs are unchanged.
        predicted = np.random.uniform(0.1, 1.0, (batch, 1)).astype("float32")
        labels = np.random.randint(0, 2, (batch, 1)).astype("float32")

        self.inputs = {'Predicted': predicted, 'Labels': labels}
        self.attrs = {'epsilon': eps}

        positive_term = labels * np.log(predicted + eps)
        negative_term = (1 - labels) * np.log(1 - predicted + eps)
        self.outputs = {'Loss': -positive_term - negative_term}

    def test_check_output(self):
        self.check_output()

    def test_check_grad(self):
        self.check_grad(['Predicted'], 'Loss', max_relative_error=0.03)
if __name__ == '__main__':
unittest.main()
import unittest
import numpy as np
from op_test import OpTest
def nce(input, weight, bias, sample_weight, labels, num_classes,
        num_sample_class):
    """Numpy reference for the noise-contrastive-estimation forward pass.

    For every row of ``input`` the true classes listed in ``labels`` are
    scored first, followed by the fixed negative classes
    ``0 .. num_sample_class - 1``.

    :param input: 2-D array, one row per example.
    :param weight: 2-D array indexed by class id; each row is dotted with an
        input row.
    :param bias: 1-D array indexed by class id, or None for no bias.
    :param sample_weight: per-example weight indexed by row, or None to
        weight every example by 1.
    :param labels: 2-D integer array of true class ids, one row per example.
    :param num_classes: total number of classes (used for the noise prior).
    :param num_sample_class: number of negative classes per example.
    :return: tuple ``(cost, sample_logits, sample_labels)`` where ``cost``
        has shape (batch, 1) and the other two have shape
        (batch, num_sample_class + num_true_class).
    """
    batch_size = input.shape[0]
    num_true_class = labels.shape[1]

    # Each entry is (row index, class id, is_true_class, example weight).
    samples = []
    for row in range(batch_size):
        if sample_weight is None:
            row_weight = 1
        else:
            row_weight = sample_weight[row]
        for true_cls in labels[row]:
            samples.append((row, true_cls, True, row_weight))
        for neg_cls in range(num_sample_class):
            samples.append((row, neg_cls, False, row_weight))
    sample_labels = [entry[1] for entry in samples]

    # Logit = optional bias + <input row, weight row>, accumulated in float32.
    sample_out = np.zeros(len(samples)).astype(np.float32)
    for k, (row, cls, _, _) in enumerate(samples):
        if bias is not None:
            sample_out[k] = bias[cls]
        sample_out[k] += np.dot(input[row], weight[cls])

    # Sigmoid activation.
    sample_out = 1.0 / (1.0 + np.exp(-sample_out))

    # Per-example cost: true samples use -log(o / (o + b)), negatives use
    # -log(b / (o + b)), each scaled by the example weight.
    out = np.zeros(batch_size).astype(np.float32)
    b = 1.0 / num_classes * num_sample_class
    for k, (row, _, is_true, row_weight) in enumerate(samples):
        o = sample_out[k]
        if is_true:
            cost = -np.log(o / (o + b))
        else:
            cost = -np.log(b / (o + b))
        out[row] += cost * row_weight

    per_row = num_sample_class + num_true_class
    return (out[:, np.newaxis],
            np.array(sample_out).reshape(batch_size, per_row),
            np.array(sample_labels).reshape(batch_size, per_row))
class TestNCE(OpTest):
    """Operator test case for the 'nce' op.

    Random inputs are generated by ``generate_data`` and the expected
    outputs are produced by the numpy reference ``nce`` in this file.
    Subclasses override ``set_data`` to try other problem sizes.
    """

    def generate_data(self, dim, batch_size, num_classes, num_true_class,
                      num_neg_samples):
        """Fill ``self.inputs`` and ``self.attrs`` with random test data."""
        # The draw order is kept fixed so seeded runs produce the same data.
        features = np.random.randn(batch_size, dim).astype(np.float32)
        class_weight = np.random.randn(num_classes, dim).astype(np.float32)
        class_bias = np.random.randn(num_classes).astype(np.float32)
        example_weight = np.random.randn(batch_size).astype(np.float32)
        true_labels = np.random.randint(0, num_classes,
                                        (batch_size, num_true_class))

        self.attrs = {
            'num_total_classes': num_classes,
            'num_neg_samples': num_neg_samples,
            # Negatives are the first num_neg_samples class ids, which is
            # what the reference nce() iterates over.
            'custom_neg_classes': range(num_neg_samples)
        }
        self.inputs = {
            'Input': features,
            'Label': true_labels,
            'Weight': class_weight,
            'Bias': class_bias,
            'SampleWeight': example_weight
        }

    def set_data(self):
        """Choose the problem size for this test case."""
        self.generate_data(5, 5, 4, 1, 2)

    def compute(self):
        """Compute the expected outputs with the numpy reference."""
        cost, sample_logits, sample_labels = nce(
            self.inputs['Input'], self.inputs['Weight'], self.inputs['Bias'],
            self.inputs['SampleWeight'], self.inputs['Label'],
            self.attrs['num_total_classes'], self.attrs['num_neg_samples'])
        self.outputs = {
            'Cost': cost,
            'SampleLogits': sample_logits,
            'SampleLabels': sample_labels
        }

    def setUp(self):
        self.op_type = 'nce'
        self.set_data()
        self.compute()

    def test_check_output(self):
        self.check_output()

    def test_check_grad(self):
        self.check_grad(
            ["Input", "Weight", "Bias"], "Cost", max_relative_error=0.02)
class TestNCECase1(TestNCE):
    """Same checks as TestNCE with a larger problem and two true classes."""

    def set_data(self):
        self.generate_data(
            dim=10,
            batch_size=20,
            num_classes=10,
            num_true_class=2,
            num_neg_samples=5)
if __name__ == '__main__':
unittest.main()
import unittest import unittest
from paddle.v2.fluid.framework import Variable, Program, g_main_program
import paddle.v2.fluid.core as core import paddle.v2.fluid.core as core
from paddle.v2.fluid.framework import Program, default_startup_program
main_program = default_startup_program()
class TestOperator(unittest.TestCase): class TestOperator(unittest.TestCase):
def test_error_type(self): def test_error_type(self):
block = g_main_program.create_block() block = main_program.create_block()
try: try:
block.append_op() block.append_op()
self.assertFail() self.assertFail()
......
import unittest import unittest
from paddle.v2.fluid.framework import g_main_program from paddle.v2.fluid.framework import default_main_program
import paddle.v2.fluid.core as core import paddle.v2.fluid.core as core
from paddle.v2.fluid.executor import Executor from paddle.v2.fluid.executor import Executor
import paddle.v2.fluid.io as io import paddle.v2.fluid.io as io
from paddle.v2.fluid.initializer import ConstantInitializer from paddle.v2.fluid.initializer import ConstantInitializer
import numpy as np import numpy as np
main_program = default_main_program()
class TestParameter(unittest.TestCase): class TestParameter(unittest.TestCase):
def test_param(self): def test_param(self):
shape = [784, 100] shape = [784, 100]
val = 1.0625 val = 1.0625
b = g_main_program.global_block() b = main_program.global_block()
param = b.create_parameter( param = b.create_parameter(
name='fc.w', name='fc.w',
shape=shape, shape=shape,
...@@ -23,9 +25,9 @@ class TestParameter(unittest.TestCase): ...@@ -23,9 +25,9 @@ class TestParameter(unittest.TestCase):
self.assertEqual(core.DataType.FP32, param.dtype) self.assertEqual(core.DataType.FP32, param.dtype)
self.assertEqual(0, param.block.idx) self.assertEqual(0, param.block.idx)
exe = Executor(core.CPUPlace()) exe = Executor(core.CPUPlace())
p = exe.run(g_main_program, fetch_list=[param])[0] p = exe.run(main_program, fetch_list=[param])[0]
self.assertTrue(np.allclose(p, np.ones(shape) * val)) self.assertTrue(np.allclose(p, np.ones(shape) * val))
p = io.get_parameter_value_by_name('fc.w', exe, g_main_program) p = io.get_parameter_value_by_name('fc.w', exe, main_program)
self.assertTrue(np.allclose(np.array(p), np.ones(shape) * val)) self.assertTrue(np.allclose(np.array(p), np.ones(shape) * val))
......
import unittest
import numpy as np
import paddle.v2.fluid as fluid
import paddle.v2.fluid.profiler as profiler
import paddle.v2.fluid.layers as layers
class TestProfiler(unittest.TestCase):
    """Smoke test that runs a small conv2d program inside cuda_profiler."""

    def test_nvprof(self):
        # Nothing to profile when the build reports no GPU support.
        if not fluid.core.is_compile_gpu():
            return

        num_iterations = 8
        batch_shape = [4, 3, 28, 28]

        # Build a single conv2d layer in the default main program.
        image = layers.data(name='data', shape=[3, 28, 28], dtype='float32')
        layers.conv2d(image, 20, 3, stride=[1, 1], padding=[1, 1])

        exe = fluid.Executor(fluid.GPUPlace(0))
        exe.run(fluid.default_startup_program())

        # NOTE(review): the arguments look like an output path and an output
        # mode -- confirm against profiler.cuda_profiler.
        with profiler.cuda_profiler("cuda_profiler.txt", 'csv'):
            for _ in range(num_iterations):
                batch = np.random.random(batch_shape).astype('float32')
                exe.run(fluid.default_main_program(), feed={'data': batch})
if __name__ == '__main__':
unittest.main()
from __future__ import print_function from __future__ import print_function
import unittest import unittest
from paddle.v2.fluid.framework import Program from paddle.v2.fluid.framework import Program, default_main_program
from paddle.v2.fluid.framework import g_main_program
import paddle.v2.fluid.layers as layers import paddle.v2.fluid.layers as layers
main_program = default_main_program()
class TestProgram(unittest.TestCase): class TestProgram(unittest.TestCase):
def test_program(self): def test_program(self):
b = g_main_program.current_block() b = main_program.current_block()
self.assertEqual(-1, b.parent_idx) self.assertEqual(-1, b.parent_idx)
self.assertEqual(0, b.idx) self.assertEqual(0, b.idx)
b = g_main_program.create_block() b = main_program.create_block()
self.assertEqual(1, b.idx) self.assertEqual(1, b.idx)
self.assertEqual(0, b.parent_idx) self.assertEqual(0, b.parent_idx)
b = g_main_program.create_block() b = main_program.create_block()
self.assertEqual(2, b.idx) self.assertEqual(2, b.idx)
self.assertEqual(1, b.parent_idx) self.assertEqual(1, b.parent_idx)
g_main_program.rollback() main_program.rollback()
b = g_main_program.current_block() b = main_program.current_block()
self.assertEqual(1, b.idx) self.assertEqual(1, b.idx)
self.assertEqual(0, b.parent_idx) self.assertEqual(0, b.parent_idx)
b = g_main_program.create_block() b = main_program.create_block()
self.assertEqual(3, b.idx) self.assertEqual(3, b.idx)
self.assertEqual(1, b.parent_idx) self.assertEqual(1, b.parent_idx)
g_main_program.rollback() main_program.rollback()
b = g_main_program.current_block() b = main_program.current_block()
self.assertEqual(1, b.idx) self.assertEqual(1, b.idx)
self.assertEqual(0, b.parent_idx) self.assertEqual(0, b.parent_idx)
......
...@@ -271,12 +271,12 @@ class RecurrentOpTest2(RecurrentOpTest1): ...@@ -271,12 +271,12 @@ class RecurrentOpTest2(RecurrentOpTest1):
temp_l = layers.fc(input=x_t, temp_l = layers.fc(input=x_t,
size=self.input_dim, size=self.input_dim,
param_attr={'name': 'W'}, param_attr='W',
bias_attr=False, bias_attr=False,
**self.p_info) **self.p_info)
temp_r = layers.fc(input=h_pre, temp_r = layers.fc(input=h_pre,
size=self.input_dim, size=self.input_dim,
param_attr={'name': 'U'}, param_attr='U',
bias_attr=False, bias_attr=False,
**self.p_info) **self.p_info)
...@@ -454,4 +454,6 @@ class RecurrentOpNoMemBootTest(RecurrentOpTest1): ...@@ -454,4 +454,6 @@ class RecurrentOpNoMemBootTest(RecurrentOpTest1):
if __name__ == '__main__': if __name__ == '__main__':
# FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/6152
exit(0)
unittest.main() unittest.main()
...@@ -3,9 +3,11 @@ import paddle.v2.fluid.core as core ...@@ -3,9 +3,11 @@ import paddle.v2.fluid.core as core
from paddle.v2.fluid.executor import Executor from paddle.v2.fluid.executor import Executor
import paddle.v2.fluid.layers as layers import paddle.v2.fluid.layers as layers
from paddle.v2.fluid.backward import append_backward_ops from paddle.v2.fluid.backward import append_backward_ops
from paddle.v2.fluid.framework import g_main_program from paddle.v2.fluid.framework import default_main_program
import numpy import numpy
main_program = default_main_program()
class TestShrinkRNNMemory(unittest.TestCase): class TestShrinkRNNMemory(unittest.TestCase):
def test_shrink_rnn_memory(self): def test_shrink_rnn_memory(self):
...@@ -36,7 +38,7 @@ class TestShrinkRNNMemory(unittest.TestCase): ...@@ -36,7 +38,7 @@ class TestShrinkRNNMemory(unittest.TestCase):
append_backward_ops(loss=mem3_mean) append_backward_ops(loss=mem3_mean)
x_grad = exe.run( x_grad = exe.run(
feed={'x': tensor}, feed={'x': tensor},
fetch_list=[g_main_program.global_block().var('x@GRAD')])[0] fetch_list=[main_program.global_block().var('x@GRAD')])[0]
self.assertAlmostEqual(1.0, x_grad.sum(), delta=0.1) self.assertAlmostEqual(1.0, x_grad.sum(), delta=0.1)
......
...@@ -2,11 +2,12 @@ import numpy as np ...@@ -2,11 +2,12 @@ import numpy as np
from op_test import OpTest from op_test import OpTest
from scipy.special import logit from scipy.special import logit
from scipy.special import expit from scipy.special import expit
import unittest
class TestSigmoidCrossEntropyWithLogitsOp1(OpTest): class TestSigmoidCrossEntropyWithLogitsOp1(OpTest):
'''Test sigmoid_cross_entropy_with_logit_op with binary labels """Test sigmoid_cross_entropy_with_logit_op with binary label
''' """
def setUp(self): def setUp(self):
self.op_type = "sigmoid_cross_entropy_with_logits" self.op_type = "sigmoid_cross_entropy_with_logits"
...@@ -16,16 +17,16 @@ class TestSigmoidCrossEntropyWithLogitsOp1(OpTest): ...@@ -16,16 +17,16 @@ class TestSigmoidCrossEntropyWithLogitsOp1(OpTest):
'X': logit( 'X': logit(
np.random.uniform(0, 1, (batch_size, num_classes)) np.random.uniform(0, 1, (batch_size, num_classes))
.astype("float32")), .astype("float32")),
'Labels': np.random.randint(0, 2, (batch_size, num_classes)) 'Label': np.random.randint(0, 2, (batch_size, num_classes))
.astype("float32") .astype("float32")
} }
# Fw Pass is implemented as elementwise sigmoid followed by # Fw Pass is implemented as elementwise sigmoid followed by
# elementwise logistic loss # elementwise logistic loss
# Labels * -log(sigmoid(X)) + (1 - labels) * -log(1 - sigmoid(X)) # Label * -log(sigmoid(X)) + (1 - label) * -log(1 - sigmoid(X))
sigmoid_X = expit(self.inputs['X']) sigmoid_X = expit(self.inputs['X'])
term1 = self.inputs['Labels'] * np.log(sigmoid_X) term1 = self.inputs['Label'] * np.log(sigmoid_X)
term2 = (1 - self.inputs['Labels']) * np.log(1 - sigmoid_X) term2 = (1 - self.inputs['Label']) * np.log(1 - sigmoid_X)
self.outputs = {'Out': -term1 - term2} self.outputs = {'Out': -term1 - term2}
def test_check_output(self): def test_check_output(self):
...@@ -36,8 +37,8 @@ class TestSigmoidCrossEntropyWithLogitsOp1(OpTest): ...@@ -36,8 +37,8 @@ class TestSigmoidCrossEntropyWithLogitsOp1(OpTest):
class TestSigmoidCrossEntropyWithLogitsOp2(OpTest): class TestSigmoidCrossEntropyWithLogitsOp2(OpTest):
'''Test sigmoid_cross_entropy_with_logit_op with probabalistic labels """Test sigmoid_cross_entropy_with_logit_op with probabalistic label
''' """
def setUp(self): def setUp(self):
self.op_type = "sigmoid_cross_entropy_with_logits" self.op_type = "sigmoid_cross_entropy_with_logits"
...@@ -47,16 +48,16 @@ class TestSigmoidCrossEntropyWithLogitsOp2(OpTest): ...@@ -47,16 +48,16 @@ class TestSigmoidCrossEntropyWithLogitsOp2(OpTest):
'X': logit( 'X': logit(
np.random.uniform(0, 1, (batch_size, num_classes)) np.random.uniform(0, 1, (batch_size, num_classes))
.astype("float32")), .astype("float32")),
'Labels': np.random.uniform(0, 1, (batch_size, num_classes)) 'Label': np.random.uniform(0, 1, (batch_size, num_classes))
.astype("float32") .astype("float32")
} }
# Fw Pass is implemented as elementwise sigmoid followed by # Fw Pass is implemented as elementwise sigmoid followed by
# elementwise logistic loss # elementwise logistic loss
# Labels * -log(sigmoid(X)) + (1 - labels) * -log(1 - sigmoid(X)) # Label * -log(sigmoid(X)) + (1 - label) * -log(1 - sigmoid(X))
sigmoid_X = expit(self.inputs['X']) sigmoid_X = expit(self.inputs['X'])
term1 = self.inputs['Labels'] * np.log(sigmoid_X) term1 = self.inputs['Label'] * np.log(sigmoid_X)
term2 = (1 - self.inputs['Labels']) * np.log(1 - sigmoid_X) term2 = (1 - self.inputs['Label']) * np.log(1 - sigmoid_X)
self.outputs = {'Out': -term1 - term2} self.outputs = {'Out': -term1 - term2}
def test_check_output(self): def test_check_output(self):
...@@ -64,3 +65,7 @@ class TestSigmoidCrossEntropyWithLogitsOp2(OpTest): ...@@ -64,3 +65,7 @@ class TestSigmoidCrossEntropyWithLogitsOp2(OpTest):
def test_check_grad(self): def test_check_grad(self):
self.check_grad(['X'], 'Out') self.check_grad(['X'], 'Out')
if __name__ == '__main__':
unittest.main()
import unittest
import numpy as np
from op_test import OpTest
def unpool2dmax_forward_naive(input, indices, ksize, strides, paddings):
    """Numpy reference for 2-D max-unpooling.

    Every value of ``input`` is written to the position of the output plane
    given by the matching entry of ``indices``; all other output entries
    stay zero.

    :param input: 4-D array; the first two axes are looped over
        independently, the last two are the pooled height and width.
    :param indices: array of the same shape as ``input``; each entry is a
        flat (row-major) position inside one output plane.
    :param ksize: [kernel_height, kernel_width].
    :param strides: [stride_height, stride_width].
    :param paddings: [padding_height, padding_width].
    :return: 4-D array of shape (s0, s1, out_height, out_width).
    """
    s0, s1, s2, s3 = input.shape
    out_hsize = (s2 - 1) * strides[0] - 2 * paddings[0] + ksize[0]
    # The output width must be derived from the input width (s3); using the
    # height here is only correct for square inputs.
    out_wsize = (s3 - 1) * strides[1] - 2 * paddings[1] + ksize[1]
    out = np.zeros((s0, s1, out_hsize, out_wsize))
    for nidx in range(s0):
        for cidx in range(s1):
            for h in range(s2):
                for w in range(s3):
                    # indices may be stored as floats; convert before the
                    # integer row/column split.
                    hidx, widx = divmod(
                        int(indices[nidx, cidx, h, w]), out_wsize)
                    out[nidx, cidx, hidx, widx] = input[nidx, cidx, h, w]
    return out
class TestUnpoolOp(OpTest):
    """Operator test case for the 'unpool' op.

    setUp max-pools a random tensor with numpy, recording the pooled values
    and the flat position of each maximum, and uses them as the op inputs.
    The expected output is computed by the reference function selected in
    ``init_test_case``.
    """

    def setUp(self):
        self.op_type = "unpool"
        self.init_test_case()
        pre_input = np.random.random(self.shape).astype("float32")
        nsize, csize, hsize, wsize = pre_input.shape
        # Floor division: these values are used as array dimensions and
        # loop bounds, so they must be integers.
        hsize_out = (hsize - self.ksize[0] + 2 * self.paddings[0]) // \
            self.strides[0] + 1
        wsize_out = (wsize - self.ksize[1] + 2 * self.paddings[1]) // \
            self.strides[1] + 1
        input = np.zeros((nsize, csize, hsize_out, wsize_out))
        indices = np.zeros((nsize, csize, hsize_out, wsize_out))
        for i in range(hsize_out):
            for j in range(wsize_out):
                # Pooling window, clipped to the bounds of pre_input.
                r_start = max(i * self.strides[0] - self.paddings[0], 0)
                r_end = min(i * self.strides[0] + self.ksize[0] -
                            self.paddings[0], hsize)
                c_start = max(j * self.strides[1] - self.paddings[1], 0)
                c_end = min(j * self.strides[1] + self.ksize[1] -
                            self.paddings[1], wsize)
                win_width = c_end - c_start
                for nidx in range(nsize):
                    for cidx in range(csize):
                        x_masked = pre_input[nidx, cidx, r_start:r_end,
                                             c_start:c_end]
                        input[nidx, cidx, i, j] = x_masked.max()
                        # argmax() is a flat index into the (possibly
                        # clipped) window, so split it by the real window
                        # width rather than the nominal kernel width.
                        row, col = divmod(int(x_masked.argmax()), win_width)
                        indices[nidx, cidx, i, j] = \
                            (r_start + row) * wsize + c_start + col

        output = self.unpool2d_forward_naive(
            input, indices, self.ksize, self.strides,
            self.paddings).astype("float32")
        self.inputs = {
            'X': input.astype('float32'),
            'Indices': indices.astype('int32')
        }
        self.attrs = {
            'strides': self.strides,
            'paddings': self.paddings,
            'ksize': self.ksize,
            'unpooling_type': self.unpooling_type,
        }
        self.outputs = {'Out': output.astype('float32')}

    def test_check_output(self):
        self.check_output()

    def test_check_grad(self):
        self.check_grad(['X'], 'Out')

    def init_test_case(self):
        """Select the reference implementation and the pooling geometry."""
        self.unpool2d_forward_naive = unpool2dmax_forward_naive
        self.unpooling_type = "max"
        self.shape = [6, 4, 5, 5]
        self.ksize = [3, 3]
        self.strides = [2, 2]
        self.paddings = [0, 0]
if __name__ == '__main__':
unittest.main()
import unittest import unittest
from paddle.v2.fluid.framework import g_main_program, Program, convert_np_dtype_to_dtype_ from paddle.v2.fluid.framework import default_main_program, Program, convert_np_dtype_to_dtype_
import paddle.v2.fluid.core as core import paddle.v2.fluid.core as core
import numpy as np import numpy as np
...@@ -18,7 +18,7 @@ class TestVariable(unittest.TestCase): ...@@ -18,7 +18,7 @@ class TestVariable(unittest.TestCase):
self.assertRaises(ValueError, lambda: convert("int8")) self.assertRaises(ValueError, lambda: convert("int8"))
def test_var(self): def test_var(self):
b = g_main_program.current_block() b = default_main_program().current_block()
w = b.create_var( w = b.create_var(
dtype="float64", shape=[784, 100], lod_level=0, name="fc.w") dtype="float64", shape=[784, 100], lod_level=0, name="fc.w")
self.assertNotEqual(str(w), "") self.assertNotEqual(str(w), "")
......
...@@ -14,13 +14,16 @@ ...@@ -14,13 +14,16 @@
__all__ = [ __all__ = [
'map_readers', 'buffered', 'compose', 'chain', 'shuffle', 'map_readers', 'buffered', 'compose', 'chain', 'shuffle',
'ComposeNotAligned', 'firstn', 'xmap_readers' 'ComposeNotAligned', 'firstn', 'xmap_readers', 'pipe_reader'
] ]
from threading import Thread
import subprocess
from Queue import Queue
import itertools import itertools
import random import random
from Queue import Queue import zlib
from threading import Thread
def map_readers(func, *readers): def map_readers(func, *readers):
...@@ -323,3 +326,101 @@ def xmap_readers(mapper, reader, process_num, buffer_size, order=False): ...@@ -323,3 +326,101 @@ def xmap_readers(mapper, reader, process_num, buffer_size, order=False):
yield sample yield sample
return xreader return xreader
def _buf2lines(buf, line_break="\n"):
# FIXME: line_break should be automatically configured.
lines = buf.split(line_break)
return lines[:-1], lines[-1]
def pipe_reader(left_cmd,
                parser,
                bufsize=8192,
                file_type="plain",
                cut_lines=True,
                line_break="\n"):
    """
    pipe_reader reads data as a stream from a command: it takes the
    command's stdout through a pipe buffer, hands the data to the parser,
    and yields the parsed values in your desired format.

    You can use a standard linux command or call another program
    to read data from HDFS, Ceph, URL, AWS S3 etc:

    cmd = "hadoop fs -cat /path/to/some/file"
    cmd = "cat sample_file.tar.gz"
    cmd = "curl http://someurl"
    cmd = "python print_s3_bucket.py"

    A sample parser:

    def sample_parser(lines):
        # parse each line as one sample data,
        # return a list of samples as batches.
        ret = []
        for l in lines:
            ret.append(l.split(" ")[1:5])
        return ret

    Note: the command is started when pipe_reader is called, not when the
    returned reader is iterated, and its stdout is consumed by the first
    complete iteration. The command string is split on single spaces, so
    quoted arguments are not supported.

    :param left_cmd: command to execute to get stdout from.
    :type left_cmd: string
    :param parser: parser function to parse lines of data.
                   if cut_lines is True, parser will receive a list
                   of lines.
                   if cut_lines is False, parser will receive a
                   raw buffer each time.
                   parser should return a list of parsed values.
    :type parser: callable
    :param bufsize: the buffer size used for the stdout pipe.
    :type bufsize: int
    :param file_type: can be plain/gzip, stream buffer data type.
    :type file_type: string
    :param cut_lines: whether to pass lines instead of raw buffer
                      to the parser
    :type cut_lines: bool
    :param line_break: line break of the file, like \\n or \\r
    :type line_break: string
    :return: the reader generator.
    :rtype: callable
    :raises TypeError: if left_cmd is not a string, parser is not callable,
                       or file_type is neither "plain" nor "gzip".
    """
    if not isinstance(left_cmd, str):
        raise TypeError("left_cmd must be a string")
    if not callable(parser):
        raise TypeError("parser must be a callable object")
    # Reject a bad file_type before spawning the process instead of after
    # the first chunk has already been read.
    if file_type not in ("plain", "gzip"):
        raise TypeError("file_type %s is not allowed" % file_type)

    process = subprocess.Popen(
        left_cmd.split(" "), bufsize=bufsize, stdout=subprocess.PIPE)
    # TODO(typhoonzero): add a thread to read stderr

    # Creating the decompress object once is better than creating it in
    # the loop. Adding 32 to MAX_WBITS lets zlib detect a gzip or zlib
    # header automatically.
    dec = zlib.decompressobj(32 + zlib.MAX_WBITS)

    def reader():
        remained = ""
        while True:
            buff = process.stdout.read(bufsize)
            if not buff:
                break
            if file_type == "gzip":
                decomp_buff = dec.decompress(buff)
            else:
                decomp_buff = buff

            if cut_lines:
                # Keep the unfinished last line for the next chunk.
                lines, remained = _buf2lines(''.join(
                    [remained, decomp_buff]), line_break)
                for ret in parser(lines):
                    yield ret
            else:
                for ret in parser(decomp_buff):
                    yield ret

        # End of stream: emit whatever is still buffered so that a final
        # line without a trailing line break is not silently lost.
        tail = dec.flush() if file_type == "gzip" else ""
        if cut_lines:
            lines, remained = _buf2lines(''.join([remained, tail]),
                                         line_break)
            if remained:
                lines.append(remained)
            if lines:
                for ret in parser(lines):
                    yield ret
        elif tail:
            for ret in parser(tail):
                yield ret

        # The child has closed its stdout; reap it so it does not linger
        # as a zombie process.
        process.wait()

    return reader
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册