Commit a061a85e authored by Wang,Jeff

Merge branch 'develop' into updateWriteDocsCN

......@@ -16,6 +16,8 @@ cmake_minimum_required(VERSION 3.0)
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
SET(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
SET(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
include(system)
......@@ -54,6 +56,7 @@ option(WITH_C_API "Compile PaddlePaddle with C-API(Prediction)" OFF)
option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF)
option(GLIDE_INSTALL "Download and install go dependencies " ON)
option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF)
option(WITH_DISTRIBUTE "Compile with grpc distributed support" OFF)
option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF)
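# For illustration, a configure invocation exercising the new switches might
# look like this (a sketch, not a documented command line):
#   cmake .. -DWITH_DISTRIBUTE=ON -DCMAKE_BUILD_TYPE=RelWithDebInfo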
# CMAKE_BUILD_TYPE
......@@ -67,9 +70,6 @@ if(ANDROID OR IOS)
if(ANDROID)
if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "16")
message(FATAL_ERROR "Unsupport standalone toolchains with Android API level lower than 16")
endif()
endif()
......@@ -83,6 +83,8 @@ if(ANDROID OR IOS)
"Disable RDMA when cross-compiling for Android and iOS" FORCE)
set(WITH_MKL OFF CACHE STRING
"Disable MKL when cross-compiling for Android and iOS" FORCE)
set(WITH_GOLANG OFF CACHE STRING
"Disable golang when cross-compiling for Android and iOS" FORCE)
# Compile PaddlePaddle mobile inference library
if (NOT WITH_C_API)
......
......@@ -6,10 +6,21 @@ width = 224
num_class = 1000
batch_size = get_config_arg('batch_size', int, 128)
use_gpu = get_config_arg('use_gpu', bool, True)
is_infer = get_config_arg("is_infer", bool, False)
args = {
'height': height,
'width': width,
'color': True,
'num_class': num_class,
'is_infer': is_infer
}
define_py_data_sources2(
"train.list", None, module="provider", obj="process", args=args)
"train.list" if not is_infer else None,
"test.list" if is_infer else None,
module="provider",
obj="process",
args=args)
settings(
batch_size=batch_size,
......@@ -146,7 +157,6 @@ def inception(name, input, channels, \
return cat
lab = data_layer(name="label", size=1000)
data = data_layer(name="input", size=3 * height * width)
# stage 1
......@@ -224,6 +234,10 @@ pool5 = img_pool_layer(
dropout = dropout_layer(name="dropout", input=pool5, dropout_rate=0.4)
out3 = fc_layer(
name="output3", input=dropout, size=1000, act=SoftmaxActivation())
if is_infer:
outputs(out3)
else:
lab = data_layer(name="label", size=num_class)
loss3 = cross_entropy(name='loss3', input=out3, label=lab)
outputs(loss3)
......@@ -13,14 +13,20 @@ def initHook(settings, height, width, color, num_class, **kwargs):
settings.data_size = settings.height * settings.width * 3
else:
settings.data_size = settings.height * settings.width
settings.is_infer = kwargs.get('is_infer', False)
if settings.is_infer:
settings.slots = [dense_vector(settings.data_size)]
else:
settings.slots = [dense_vector(settings.data_size), integer_value(1)]
@provider(
init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM)
def process(settings, file_list):
for i in xrange(2560 if settings.is_infer else 1024):
img = np.random.rand(1, settings.data_size).reshape(-1, 1).flatten()
if settings.is_infer:
yield img.astype('float32')
else:
lab = random.randint(0, settings.num_class - 1)
yield img.astype('float32'), int(lab)
......@@ -6,11 +6,21 @@ width = 224
num_class = 1000
batch_size = get_config_arg('batch_size', int, 64)
layer_num = get_config_arg("layer_num", int, 50)
is_infer = get_config_arg("is_infer", bool, False)
args = {
'height': height,
'width': width,
'color': True,
'num_class': num_class,
'is_infer': is_infer
}
define_py_data_sources2(
"train.list", None, module="provider", obj="process", args=args)
"train.list" if not is_infer else None,
"test.list" if is_infer else None,
module="provider",
obj="process",
args=args)
settings(
batch_size=batch_size,
......@@ -45,7 +55,10 @@ def conv_bn_layer(name,
act=LinearActivation(),
bias_attr=False)
return batch_norm_layer(
name=name + "_bn", input=tmp, act=active_type, use_global_stats=is_test)
name=name + "_bn",
input=tmp,
act=active_type,
use_global_stats=is_infer)
def bottleneck_block(name, input, num_filters1, num_filters2):
......@@ -207,7 +220,9 @@ elif layer_num == 152:
else:
print("Wrong layer number.")
lbl = data_layer(name="label", size=num_class)
loss = cross_entropy(name='loss', input=resnet, label=lbl)
inputs(img, lbl)
outputs(loss)
if is_infer:
outputs(resnet)
else:
lbl = data_layer(name="label", size=num_class)
loss = cross_entropy(name='loss', input=resnet, label=lbl)
outputs(loss)
set -e
function clock_to_seconds() {
hours=`echo $1 | awk -F ':' '{print $1}'`
mins=`echo $1 | awk -F ':' '{print $2}'`
secs=`echo $1 | awk -F ':' '{print $3}'`
echo `awk 'BEGIN{printf "%.2f",('$secs' + '$mins' * 60 + '$hours' * 3600)}'`
}
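# Example (assumed input format HH:MM:SS[.ms]):
#   clock_to_seconds "01:02:03.50"   # prints 3723.50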
function infer() {
unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
topology=$1
layer_num=$2
bs=$3
use_mkldnn=$4
if [ $4 == "True" ]; then
thread=1
log="logs/infer-${topology}-${layer_num}-mkldnn-${bs}.log"
elif [ $4 == "False" ]; then
thread=`nproc`
if [ $thread -gt $bs ]; then
thread=$bs
fi
log="logs/infer-${topology}-${layer_num}-${thread}mklml-${bs}.log"
else
echo "Wrong input $4, use True or False."
exit 0
fi
models_in="models/${topology}-${layer_num}/pass-00000/"
if [ ! -d $models_in ]; then
echo "Training model ${topology}_${layer_num}"
paddle train --job=train \
--config="${topology}.py" \
--use_mkldnn=True \
--use_gpu=False \
--trainer_count=1 \
--num_passes=1 \
--save_dir="models/${topology}-${layer_num}" \
--config_args="batch_size=128,layer_num=${layer_num}" \
> /dev/null 2>&1
echo "Done"
fi
log_period=$((256 / bs))
paddle train --job=test \
--config="${topology}.py" \
--use_mkldnn=$use_mkldnn \
--use_gpu=False \
--trainer_count=$thread \
--log_period=$log_period \
--config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True" \
--init_model_path=$models_in \
2>&1 | tee ${log}
# calculate the last 5 logs period time of 1280 samples,
# the time before are burning time.
start=`tail ${log} -n 7 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
end=`tail ${log} -n 2 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
start_sec=`clock_to_seconds $start`
end_sec=`clock_to_seconds $end`
fps=`awk 'BEGIN{printf "%.2f",(1280 / ('$end_sec' - '$start_sec'))}'`
echo "Last 1280 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec;" >> ${log}
echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
}
if [ ! -f "train.list" ]; then
echo " " > train.list
fi
if [ ! -f "test.list" ]; then
echo " " > test.list
fi
if [ ! -d "logs" ]; then
mkdir logs
fi
if [ ! -d "models" ]; then
mkdir -p models
fi
# inference benchmark
for use_mkldnn in True False; do
for batchsize in 1 2 4 8 16; do
infer googlenet v1 $batchsize $use_mkldnn
infer resnet 50 $batchsize $use_mkldnn
infer vgg 19 $batchsize $use_mkldnn
done
done
......@@ -8,13 +8,13 @@ function train() {
use_mkldnn=$4
if [ $4 == "True" ]; then
thread=1
log="logs/${topology}-${layer_num}-mkldnn-${bs}.log"
log="logs/train-${topology}-${layer_num}-mkldnn-${bs}.log"
elif [ $4 == "False" ]; then
thread=`nproc`
# each trainer_count use only 1 core to avoid conflict
log="logs/${topology}-${layer_num}-${thread}mklml-${bs}.log"
log="logs/train-${topology}-${layer_num}-${thread}mklml-${bs}.log"
else
echo "Wrong input $3, use True or False."
echo "Wrong input $4, use True or False."
exit 0
fi
args="batch_size=${bs},layer_num=${layer_num}"
......@@ -30,13 +30,14 @@ function train() {
2>&1 | tee ${log}
}
if [ ! -d "train.list" ]; then
if [ ! -f "train.list" ]; then
echo " " > train.list
fi
if [ ! -d "logs" ]; then
mkdir logs
fi
# training benchmark
for use_mkldnn in True False; do
for batchsize in 64 128 256; do
train vgg 19 $batchsize $use_mkldnn
......
......@@ -6,10 +6,21 @@ width = 224
num_class = 1000
batch_size = get_config_arg('batch_size', int, 64)
layer_num = get_config_arg('layer_num', int, 19)
is_infer = get_config_arg("is_infer", bool, False)
args = {
'height': height,
'width': width,
'color': True,
'num_class': num_class,
'is_infer': is_infer
}
define_py_data_sources2(
"train.list", None, module="provider", obj="process", args=args)
"train.list" if not is_infer else None,
"test.list" if is_infer else None,
module="provider",
obj="process",
args=args)
settings(
batch_size=batch_size,
......@@ -98,6 +109,9 @@ elif layer_num == 19:
else:
print("Wrong layer number.")
if is_infer:
outputs(vgg)
else:
lab = data_layer('label', num_class)
loss = cross_entropy(input=vgg, label=lab)
outputs(loss)
......@@ -13,7 +13,7 @@
# limitations under the License.
#
IF(MOBILE_INFERENCE OR NOT WITH_DISTRIBUTE)
return()
ENDIF()
......
......@@ -26,12 +26,21 @@ ENDIF(WIN32)
INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR})
IF(ANDROID AND ${CMAKE_SYSTEM_VERSION} VERSION_LESS "21")
# Using the unofficial glog for Android API < 21
SET(GLOG_REPOSITORY "https://github.com/Xreki/glog.git")
SET(GLOG_TAG "8a547150548b284382ccb6582408e9140ff2bea8")
ELSE()
SET(GLOG_REPOSITORY "https://github.com/google/glog.git")
SET(GLOG_TAG "v0.3.5")
ENDIF()
ExternalProject_Add(
extern_glog
${EXTERNAL_PROJECT_LOG_ARGS}
DEPENDS gflags
GIT_REPOSITORY "https://github.com/google/glog.git"
GIT_TAG v0.3.5
GIT_REPOSITORY ${GLOG_REPOSITORY}
GIT_TAG ${GLOG_TAG}
PREFIX ${GLOG_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
......
......@@ -13,7 +13,7 @@
# limitations under the License.
#
IF(MOBILE_INFERENCE OR NOT WITH_DISTRIBUTE)
return()
ENDIF()
......@@ -24,9 +24,9 @@ SET(GRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/grpc)
SET(GRPC_INCLUDE_DIR "${GRPC_INSTALL_DIR}/include/" CACHE PATH "grpc include directory." FORCE)
SET(GRPC_CPP_PLUGIN "${GRPC_INSTALL_DIR}/bin/grpc_cpp_plugin" CACHE FILEPATH "GRPC_CPP_PLUGIN" FORCE)
IF(APPLE)
SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin | sed "s/-Werror//g" | sh)
ELSE()
SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin)
ENDIF()
ExternalProject_Add(
......@@ -42,7 +42,7 @@ ExternalProject_Add(
# Disable -Werror, otherwise the compile will fail in MacOS.
# It seems that we cannot configure that by make command.
# Just dry run make command and remove `-Werror`, then use a shell to run make commands
BUILD_COMMAND ${BUILD_CMD}
INSTALL_COMMAND make prefix=${GRPC_INSTALL_DIR} install
)
......
......@@ -188,14 +188,26 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
SET(OPTIONAL_CACHE_ARGS "-DZLIB_ROOT:STRING=${ZLIB_ROOT}")
ENDIF()
SET(PROTOBUF_REPO "https://github.com/google/protobuf.git")
SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546")
IF(MOBILE_INFERENCE)
# The reason why the official version is not used is described in
# https://github.com/PaddlePaddle/Paddle/issues/6114
SET(PROTOBUF_REPO "https://github.com/qingqing01/protobuf.git")
SET(PROTOBUF_TAG "v3.2.0")
IF(NOT BUILD_FOR_HOST)
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} "-Dprotobuf_BUILD_PROTOC_BINARIES=OFF")
ENDIF()
ENDIF()
ExternalProject_Add(
${TARGET_NAME}
${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${PROTOBUF_SOURCES_DIR}
UPDATE_COMMAND ""
DEPENDS zlib
GIT_REPOSITORY "https://github.com/google/protobuf.git"
GIT_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546"
GIT_REPOSITORY ${PROTOBUF_REPO}
GIT_TAG ${PROTOBUF_TAG}
CONFIGURE_COMMAND
${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/${TARGET_NAME}/cmake
${OPTIONAL_ARGS}
......@@ -213,7 +225,11 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
)
ENDFUNCTION()
IF(NOT MOBILE_INFERENCE)
SET(PROTOBUF_VERSION 3.1)
ELSE()
SET(PROTOBUF_VERSION 3.2)
ENDIF()
IF(CMAKE_CROSSCOMPILING)
build_protobuf(protobuf_host TRUE)
LIST(APPEND external_project_dependencies protobuf_host)
......
......@@ -111,6 +111,8 @@ set(COMMON_FLAGS
-Wno-error=sign-compare
-Wno-error=unused-local-typedefs
-Wno-error=parentheses-equality # Warnings in pybind11
-Wno-error=ignored-attributes # Warnings in Eigen, gcc 6.3
-Wno-error=terminate # Warning in PADDLE_ENFORCE
)
set(GPU_COMMON_FLAGS
......
......@@ -227,8 +227,8 @@ function(cc_test TARGET_NAME)
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_executable(${TARGET_NAME} ${cc_test_SRCS})
target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
endif()
endfunction(cc_test)
......@@ -288,8 +288,8 @@ function(nv_test TARGET_NAME)
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
add_test(${TARGET_NAME} ${TARGET_NAME})
endif()
endfunction(nv_test)
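# Hypothetical usage of the two helpers above (illustrative target and
# dependency names, not actual Paddle targets):
#   cc_test(scope_test SRCS scope_test.cc DEPS scope)
#   nv_test(gpu_info_test SRCS gpu_info_test.cu DEPS gpu_info)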
......@@ -505,12 +505,12 @@ function(grpc_library TARGET_NAME)
set_source_files_properties(
${grpc_grpc_srcs}
PROPERTIES
COMPILE_FLAGS "-Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
cc_library("${TARGET_NAME}_grpc" SRCS "${grpc_grpc_srcs}")
set_source_files_properties(
${grpc_library_SRCS}
PROPERTIES
COMPILE_FLAGS "-Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
cc_library("${TARGET_NAME}" SRCS "${grpc_library_SRCS}" DEPS "${TARGET_NAME}_grpc" "${TARGET_NAME}_proto" "${grpc_library_DEPS}")
endfunction()
......@@ -7,3 +7,4 @@ API
v2/model_configs.rst
v2/data.rst
v2/run_logic.rst
v2/fluid.rst
======================
Fluid
======================
.. toctree::
:maxdepth: 1
fluid/layers.rst
fluid/data_feeder.rst
fluid/executor.rst
fluid/initializer.rst
fluid/evaluator.rst
fluid/nets.rst
fluid/optimizer.rst
fluid/param_attr.rst
fluid/profiler.rst
fluid/regularizer.rst
===========
DataFeeder
===========
DataFeeder
-----------
.. automodule:: paddle.v2.fluid.data_feeder
:members: DataFeeder
:noindex:
===========
Evaluator
===========
Evaluator
-----------
.. automodule:: paddle.v2.fluid.evaluator
:members: Evaluator
:noindex:
===========
Executor
===========
Executor
-----------
.. automodule:: paddle.v2.fluid.executor
:members: Executor
:noindex:
===========
Initializer
===========
Initializer
-----------
.. automodule:: paddle.v2.fluid.initializer
:members: Initializer
:noindex:
ConstantInitializer
-------------------
.. automodule:: paddle.v2.fluid.initializer
:members: ConstantInitializer
:noindex:
UniformInitializer
------------------
.. automodule:: paddle.v2.fluid.initializer
:members: UniformInitializer
:noindex:
NormalInitializer
-----------------
.. automodule:: paddle.v2.fluid.initializer
:members: NormalInitializer
:noindex:
XavierInitializer
-----------------
.. automodule:: paddle.v2.fluid.initializer
:members: XavierInitializer
:noindex:
MSRAInitializer
---------------
.. automodule:: paddle.v2.fluid.initializer
:members: MSRAInitializer
:noindex:
==========
Layers
==========
fc
---
.. autofunction:: paddle.v2.fluid.layers.fc
:noindex:
embedding
---------
.. autofunction:: paddle.v2.fluid.layers.embedding
:noindex:
dynamic_lstm
------------
.. autofunction:: paddle.v2.fluid.layers.dynamic_lstm
:noindex:
data
---------
.. autofunction:: paddle.v2.fluid.layers.data
:noindex:
mean
---------
.. autofunction:: paddle.v2.fluid.layers.mean
:noindex:
mul
---------
.. autofunction:: paddle.v2.fluid.layers.mul
:noindex:
elementwise_add
---------------
.. autofunction:: paddle.v2.fluid.layers.elementwise_add
:noindex:
elementwise_div
---------------
.. autofunction:: paddle.v2.fluid.layers.elementwise_div
:noindex:
dropout
---------
.. autofunction:: paddle.v2.fluid.layers.dropout
:noindex:
reshape
---------
.. autofunction:: paddle.v2.fluid.layers.reshape
:noindex:
sigmoid
---------
.. autofunction:: paddle.v2.fluid.layers.sigmoid
:noindex:
scale
---------
.. autofunction:: paddle.v2.fluid.layers.scale
:noindex:
transpose
---------
.. autofunction:: paddle.v2.fluid.layers.transpose
:noindex:
sigmoid_cross_entropy_with_logits
---------------------------------
.. autofunction:: paddle.v2.fluid.layers.sigmoid_cross_entropy_with_logits
:noindex:
cast
---------
.. autofunction:: paddle.v2.fluid.layers.cast
:noindex:
concat
---------
.. autofunction:: paddle.v2.fluid.layers.concat
:noindex:
sums
---------
.. autofunction:: paddle.v2.fluid.layers.sums
:noindex:
linear_chain_crf
----------------
.. autofunction:: paddle.v2.fluid.layers.linear_chain_crf
:noindex:
assign
---------
.. autofunction:: paddle.v2.fluid.layers.assign
:noindex:
split_lod_tensor
----------------
.. autofunction:: paddle.v2.fluid.layers.split_lod_tensor
:noindex:
merge_lod_tensor
----------------
.. autofunction:: paddle.v2.fluid.layers.merge_lod_tensor
:noindex:
cos_sim
---------
.. autofunction:: paddle.v2.fluid.layers.cos_sim
:noindex:
cross_entropy
-------------
.. autofunction:: paddle.v2.fluid.layers.cross_entropy
:noindex:
square_error_cost
-----------------
.. autofunction:: paddle.v2.fluid.layers.square_error_cost
:noindex:
accuracy
---------
.. autofunction:: paddle.v2.fluid.layers.accuracy
:noindex:
sequence_conv
-------------
.. autofunction:: paddle.v2.fluid.layers.sequence_conv
:noindex:
conv2d
---------
.. autofunction:: paddle.v2.fluid.layers.conv2d
:noindex:
sequence_pool
-------------
.. autofunction:: paddle.v2.fluid.layers.sequence_pool
:noindex:
pool2d
---------
.. autofunction:: paddle.v2.fluid.layers.pool2d
:noindex:
batch_norm
----------
.. autofunction:: paddle.v2.fluid.layers.batch_norm
:noindex:
beam_search_decode
------------------
.. autofunction:: paddle.v2.fluid.layers.beam_search_decode
:noindex:
lstm
---------
.. autofunction:: paddle.v2.fluid.layers.lstm
:noindex:
lod_rank_table
--------------
.. autofunction:: paddle.v2.fluid.layers.lod_rank_table
:noindex:
max_sequence_len
----------------
.. autofunction:: paddle.v2.fluid.layers.max_sequence_len
:noindex:
topk
---------
.. autofunction:: paddle.v2.fluid.layers.topk
:noindex:
lod_tensor_to_array
-------------------
.. autofunction:: paddle.v2.fluid.layers.lod_tensor_to_array
:noindex:
array_to_lod_tensor
-------------------
.. autofunction:: paddle.v2.fluid.layers.array_to_lod_tensor
:noindex:
fill_constant
-------------
.. autofunction:: paddle.v2.fluid.layers.fill_constant
:noindex:
fill_constant_batch_size_like
-----------------------------
.. autofunction:: paddle.v2.fluid.layers.fill_constant_batch_size_like
:noindex:
ones
---------
.. autofunction:: paddle.v2.fluid.layers.ones
:noindex:
zeros
---------
.. autofunction:: paddle.v2.fluid.layers.zeros
:noindex:
increment
---------
.. autofunction:: paddle.v2.fluid.layers.increment
:noindex:
array_write
-----------
.. autofunction:: paddle.v2.fluid.layers.array_write
:noindex:
create_array
------------
.. autofunction:: paddle.v2.fluid.layers.create_array
:noindex:
less_than
---------
.. autofunction:: paddle.v2.fluid.layers.less_than
:noindex:
array_read
----------
.. autofunction:: paddle.v2.fluid.layers.array_read
:noindex:
shrink_memory
-------------
.. autofunction:: paddle.v2.fluid.layers.shrink_memory
:noindex:
array_length
------------
.. autofunction:: paddle.v2.fluid.layers.array_length
:noindex:
conv2d_transpose
----------------
.. autofunction:: paddle.v2.fluid.layers.conv2d_transpose
:noindex:
===========
Nets
===========
simple_img_conv_pool
--------------------
.. autofunction:: paddle.v2.fluid.nets.simple_img_conv_pool
:noindex:
img_conv_group
--------------
.. autofunction:: paddle.v2.fluid.nets.img_conv_group
:noindex:
sequence_conv_pool
------------------
.. autofunction:: paddle.v2.fluid.nets.sequence_conv_pool
:noindex:
===========
Optimizer
===========
Optimizer
-----------
.. automodule:: paddle.v2.fluid.optimizer
:members: Optimizer
:noindex:
SGDOptimizer
------------
.. automodule:: paddle.v2.fluid.optimizer
:members: SGDOptimizer
:noindex:
MomentumOptimizer
-----------------
.. automodule:: paddle.v2.fluid.optimizer
:members: MomentumOptimizer
:noindex:
AdagradOptimizer
----------------
.. automodule:: paddle.v2.fluid.optimizer
:members: AdagradOptimizer
:noindex:
AdamOptimizer
-------------
.. automodule:: paddle.v2.fluid.optimizer
:members: AdamOptimizer
:noindex:
AdamaxOptimizer
---------------
.. automodule:: paddle.v2.fluid.optimizer
:members: AdamaxOptimizer
:noindex:
DecayedAdagradOptimizer
-----------------------
.. automodule:: paddle.v2.fluid.optimizer
:members: DecayedAdagradOptimizer
:noindex:
===========
ParamAttr
===========
ParamAttr
-----------
.. automodule:: paddle.v2.fluid.param_attr
:members: ParamAttr
:noindex:
===========
Profiler
===========
Profiler
-----------
.. autofunction:: paddle.v2.fluid.profiler.cuda_profiler
:noindex:
===========
Regularizer
===========
WeightDecayRegularizer
----------------------
.. automodule:: paddle.v2.fluid.regularizer
:members: WeightDecayRegularizer
:noindex:
L2DecayRegularizer
------------------
.. automodule:: paddle.v2.fluid.regularizer
:members: L2DecayRegularizer
:noindex:
L1DecayRegularizer
------------------
.. automodule:: paddle.v2.fluid.regularizer
:members: L1DecayRegularizer
:noindex:
## Evaluator Design
### Problem Statement
During training or inference, we provide an evaluation function to measure the model performance, for example, accuracy and precision. In the operator-based framework design, the data passes through the network pipeline batch by batch. As a result, inside the operator we can only calculate the metrics for one minibatch. Thus, we need to provide a mechanism to calculate the metrics over every N passes/batches the user wants.
### Evaluator Design
Currently, every operation is expressed in the graph. We divide the evaluator process into three steps.
1. Initialize the metric state and add it into the block.
2. Calculate the concerned metrics for every mini-batch. A single evaluator operator is only responsible for calculating the necessary statistics for one mini-batch. For example, the accuracy operator only calculates the accuracy of one minibatch of data per run.
3. Merge the mini-batch statistics to form the evaluation result for multiple mini-batches. When it comes to distributed training/multi-GPU training, aggregate the values from different devices.
### Implementation
This design is shown in the Python API.
Each metric operator needs to calculate the metric statistic and return the batch-aware states, and the Python side is responsible for accumulating the states across passes.
```python
......
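```

The Python snippet above is truncated in this view. As a minimal, self-contained sketch of the three steps (initialize the state, accumulate per-minibatch statistics, merge them into the final metric), assuming the operator side hands back per-batch counts, the Python-side accumulation could look like:

```python
class AccuracyEvaluator(object):
    """Toy evaluator sketch: not the actual Paddle API."""

    def __init__(self):
        self.reset()

    def reset(self):
        # Step 1: initialize the metric state.
        self.num_correct = 0
        self.num_samples = 0

    def update(self, correct, samples):
        # Step 2: accumulate the per-minibatch statistics.
        self.num_correct += correct
        self.num_samples += samples

    def eval(self):
        # Step 3: merge the statistics into the final metric.
        return float(self.num_correct) / max(self.num_samples, 1)


evaluator = AccuracyEvaluator()
for correct, samples in [(54, 64), (60, 64)]:  # fake per-batch statistics
    evaluator.update(correct, samples)
print(evaluator.eval())  # 0.890625
```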
# Intel® MKL-DNN on PaddlePaddle: Design Doc
We plan to integrate the Intel Math Kernel Library for Deep Neural Networks, [Intel MKL-DNN](https://github.com/01org/mkl-dnn),
into PaddlePaddle to bring out the full strength of Intel platforms and effectively improve PaddlePaddle's performance on Intel architectures.
<div align="center">
<img src="image/overview.png"><br/>
Figure 1. PaddlePaddle on IA
</div>
Short-term goals:

- Finish the MKL-DNN implementations of the commonly used layers.
- Finish the MKL-DNN implementations of the common deep neural networks VGG, GoogLeNet, and ResNet.

The current optimization targets PaddlePaddle's pre-refactor code framework and the V1 API.
The detailed completion status can be found [here](https://github.com/PaddlePaddle/Paddle/projects/21).
## Contents
- [Overview](#overview)
- [Actions](#actions)
- [CMake](#cmake)
- [Matrix](#matrix)
- [Layers](#layers)
- [Activations](#activations)
- [Parameters](#parameters)
- [Gradients](#gradients)
- [Unit Tests](#unit-tests)
- [Protobuf Messages](#protobuf-messages)
- [Python API](#python-api)
- [Demos](#demos)
- [Benchmarking](#benchmarking)
- [Others](#others)
- [Design Concerns](#design-concerns)
## Overview
We integrate MKL-DNN into PaddlePaddle as a third-party library; like the other third-party libraries, it is downloaded and built when PaddlePaddle is compiled.
At the same time, to further speed up PaddlePaddle's basic math operations, we also integrate MKLML (the MKL small library\[[1](#references)\]) as another third-party library; it contains only prebuilt dynamic libraries and header files.

The relationship among MKL, MKLML, and MKL-DNN is as follows:

| Name | Open Source | License | Description |
| :---------- | :--------------- | :---------- | :------------ |
| MKL | No | Proprietary | Accelerates math processing routines |
| MKLML | No | Proprietary | Small package of MKL, especially for machine learning |
| MKL-DNN | Yes | Apache 2.0 | Accelerates primitive processing routines, especially for deep neural networks |

MKLML can be used together with MKL-DNN to achieve the best performance.
<div align="center">
<img src="image/overview.png" width=350><br/>
Figure 1. PaddlePaddle on IA.
<img src="image/engine.png"><br/>
Figure 2. PaddlePaddle with MKL Engines
</div>
## Actions
We roughly divide the integration into the following aspects. The newly added files and the directory structure are:
```txt
PaddlePaddle/Paddle
├── ...
├── cmake/
│ ├── external/
│ │ ├── ...
│ │ ├── mkldnn.cmake
│ │ └── mklml.cmake
└── paddle/
├── ...
├── math/
│ ├── ...
│ └── MKLDNNMatrix.*
└── gserver/
├── ...
├── layers/
│ ├── ...
│ └── MKLDNN*Layer.*
├── activations/
│ ├── ...
│ └── MKLDNNActivations.*
└── tests/
├── ...
├── MKLDNNTester.*
└── test_MKLDNN.cpp
```
### CMake
`CMakeLists.txt` provides a single MKL-related master switch, `WITH_MKL`, which decides whether MKLML and MKL-DNN are used at build time.

- `WITH_MKLML` controls whether the MKLML library is used.
When `WITH_MKL` is on, MKLML is automatically used as PaddlePaddle's CBLAS and LAPACK library, and Intel OpenMP is enabled to improve MKLML's performance.
At build time the corresponding headers and libraries are placed under `build/third_party/install/mklml/*`.
The MKLML libraries are currently dynamic libraries only, mainly `libiomp5.so` and `libmklml_intel.so`.
- `WITH_MKLDNN` controls whether MKL-DNN is used.
When `WITH_MKL` is on, whether MKL-DNN is compiled is decided automatically from the hardware configuration\[[2](#references)\].
At build time the corresponding headers and libraries are placed under `build/third_party/install/mkldnn/*`.
The MKL-DNN library is currently only the dynamic library `libmkldnn.so`.

When `WITH_MKL` is off, both MKLML and MKL-DNN are disabled.
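For illustration, configuring a build with these switches might look like the following sketch; the flag spellings follow the description above, everything else is an assumption:

```bash
mkdir -p build && cd build
cmake .. -DWITH_MKL=ON                     # enables MKLML; MKL-DNN too if AVX2 is available
cmake .. -DWITH_MKL=ON -DWITH_MKLDNN=OFF   # MKLML only
cmake .. -DWITH_MKL=OFF                    # disables both MKLML and MKL-DNN
```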
### Matrix
Currently all data in PaddlePaddle is stored in the `NCHW` format, while MKL-DNN supports more layouts than this one,
so we define an `MKLDNNMatrix` to manage the different MKL-DNN data formats and the conversions between them.
<div align="center">
<img src="image/matrix.png"><br/>
Figure 3. MKLDNNMatrix
</div>
### Layers
All MKL-DNN-related C++ layers are placed in `paddle/gserver/layers`, following PaddlePaddle's directory layout, and their file names all start with *MKLDNN*.

All MKL-DNN layers inherit from `MKLDNNLayer`, which in turn inherits from PaddlePaddle's base class `Layer`.
`MKLDNNLayer` provides the necessary interfaces and functions and already implements the basic `forward` and `backward` logic;
a subclass only needs to use the predefined interfaces and implement its concrete functionality.

<div align="center">
<img src="image/layers.png"><br/>
Figure 4. MKLDNNLayer
</div>

Every MKLDNNLayer holds a set of MKLDNNMatrix objects for internal and external storage:

- Internal memory: `inVal_`, `inGrad_`, `outVal_`, and `outGrad_`, holding the input value, input gradient, output value, and output gradient respectively.
- External memory: all prefixed with ext, such as `extInVal_` and `extInGrad_`. They are mainly used to convert memory when the data format does not match PaddlePaddle's default `NCHW` format.
Note that PaddlePaddle's activations use `output_.value` and `output_.grad` directly,
so `extOutVal_` and `extOutGrad_` must share memory with `output_.value` and `output_.grad` respectively;
when no external storage is needed for conversion, the corresponding internal storage shares that memory as well.
- Conversion functions (resetXXX): `resetInValue`, `resetInGrad`, `resetOutValue`, and `resetOutGrad`,
which convert the input value, input gradient, output value, and output gradient respectively.
These functions re-create the internal and external storage according to their arguments; the two can also be identical, meaning no conversion is needed.

Note: each `MKLDNNLayer` subclass only ever needs to use the internal storage; all external conversion work is prepared in the reset family of functions.
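The buffer-sharing pattern above can be sketched as a toy C++ program. This is only an illustration of the internal/external memory idea, not actual PaddlePaddle code; apart from the `inVal_`/`extInVal_` and `resetInValue` names taken from the text, everything here is invented for the example.

```cpp
#include <memory>
#include <utility>
#include <vector>

// Toy stand-in for MKLDNNMatrix: a buffer plus a layout tag.
struct Buffer {
  std::vector<float> data;
  bool mkldnn_layout = false;
};

class ToyMKLDNNLayer {
 public:
  // Decide whether the external (NCHW) buffer can be used directly or an
  // internal copy in the MKL-DNN layout is needed.
  void resetInValue(std::shared_ptr<Buffer> external) {
    extInVal_ = std::move(external);
    inVal_ = extInVal_->mkldnn_layout ? extInVal_ : std::make_shared<Buffer>();
  }

  void forward() {
    if (inVal_ != extInVal_) {
      // Convert NCHW -> MKL-DNN layout; a real layer would invoke an
      // MKL-DNN reorder primitive here.
      inVal_->data = extInVal_->data;
      inVal_->mkldnn_layout = true;
    }
    // ... run the MKL-DNN primitive on inVal_ ...
  }

 private:
  std::shared_ptr<Buffer> inVal_;     // internal memory
  std::shared_ptr<Buffer> extInVal_;  // external memory
};

int main() {
  ToyMKLDNNLayer layer;
  layer.resetInValue(std::make_shared<Buffer>());
  layer.forward();
  return 0;
}
```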
### Activations
In pre-refactor PaddlePaddle, activation functions are a concept independent of `Layer`, and their input and output share a single block of memory,
so we add a corresponding `MKLDNNActivation` implementation, done in a way similar to `MKLDNNLayer`.
### Parameters
For layers with parameters, we make sure that the parameters used by `MKLDNNLayer` share memory with the buffers allocated by PaddlePaddle.
When the data layouts differ, we convert the parameters to the format MKL-DNN expects before training starts,
and save them back in PaddlePaddle's format when training ends, so no conversion at all is needed during training.
This keeps the finally saved parameter format identical to PaddlePaddle's while avoiding unnecessary conversions.
### Gradients
MKL-DNN operations always overwrite their output, i.e., results are not accumulated onto the existing data.
The benefit is that the memory never needs to be cleared, which saves unnecessary operations.
Note, however, that when the network branches, the gradients passed back by different layers must be accumulated during `backward`.
`MKLDNNLayer` therefore implements a merge method: the `Input Gradient` of each branch is first stored temporarily in an `MKLDNNMatrix`,
and the layer at the branch point is responsible for summing them and writing the result into its own `output_.grad`.
Overall, the individual subclasses thus never have to care about branching.
<div align="center">
<img src="image/gradients.png"><br/>
Figure 5. Merge Gradients
</div>
### Unit Tests
We add `test_MKLDNN.cpp` and `MKLDNNTester.*` for the MKL-DNN tests.
The tests consist of unit tests for every Layer (or Activation) and whole-network tests on simple networks.
Each test compares the result computed by PaddlePaddle on the CPU against the MKL-DNN result, and passes when the difference is below a small threshold.
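Assuming a standard CMake build with `WITH_MKLDNN` enabled, one plausible way to run just these tests from the build directory is via `ctest`; the exact test names are an assumption based on the file names above:

```bash
ctest -R MKLDNN --output-on-failure
```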
### Protobuf Messages
Necessary options may be added to `proto/ModelConfig.proto` according to the needs of specific layers.
### Python API
Currently only the **v1 API** is considered.
......@@ -80,41 +172,40 @@ if use_mkldnn
self.layer_type = mkldnn_*
```
All MKL-DNN `layer_type`s start with *mkldnn_* to tell them apart; this is guaranteed when `MKLDNN*Layer` registers the layer.
The necessary MKL-DNN interfaces may also be added to `activations.py` and `layers.py` under the `python/paddle/trainer_config_helper` directory.
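For illustration, a V1-style configuration fragment might look like the sketch below. The layer helpers are the usual v1 config helpers; the comment about the registered type follows the naming rule above and is an assumption, not verified behavior:

```python
# Illustrative v1 config; assumes MKL-DNN support is enabled at runtime.
settings(batch_size=128, learning_rate=1e-3)

data = data_layer(name="input", size=3 * 224 * 224)
# With use_mkldnn in effect, this layer would be registered under the
# "mkldnn_fc" layer type instead of the plain "fc" type.
fc = fc_layer(input=data, size=4096, act=ReluActivation())
outputs(fc)
```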
### Demos
An `mkldnn` folder will be added under the `v1_api_demo` directory, containing demo scripts for testing MKL-DNN.
In addition, a `use_mkldnn` flag will be added to `paddle/utils.Flags` to select whether the MKL-DNN functionality is used.
### Benchmarking
The corresponding scripts are added [here](https://github.com/PaddlePaddle/Paddle/tree/develop/benchmark/paddle/image) to benchmark and compare the performance of CNN networks with and without MKL-DNN.
The resulting performance comparison is published in [IntelOptimizedPaddle.md](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/IntelOptimizedPaddle.md).
### Others
1. When MKL-DNN is used, CPU buffers are aligned to 4096 bytes; see the MKL-DNN [memory](https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp#L673) header for details.
2. Dig deeper into PaddlePaddle to look for further optimization opportunities, for example using OpenMP to improve the update performance of SGD.
## Design Concerns
To better match PaddlePaddle's code style\[[3](#references)\] while sacrificing as little MKL-DNN performance as possible\[[4](#references)\],
we summarize the points that need special attention:

1. Use **deviceId_**. To add as few variables or functions as possible to the base class `Layer`,
we reuse the existing `deviceId_` member to distinguish the layer's property, defining `-2` as the device ID specific to `MKLDNNLayer`.
2. Override the base class `Layer`'s **init** function and set `deviceId_` to `-2`, marking the layer as one that runs in the MKL-DNN environment.
3. Create `MKLDNNBase` to define the classes and functions beyond layers and memory,
including `MKLDNNStream` and `CPUEngine` used by MKL-DNN, and possibly `FPGAEngine` and others in the future.
4. If an MKL-DNN layer is followed by a CPU device, `output_.value` shares memory with `extOutVal_`
and the data format is `NCHW`, so that the next CPU device receives correct data.
Whenever ordinary CPU layers are present, the format of `extOutVal_` and `extOutGrad_` is always `NCHW` or `NC`.
## References
1. The [MKL small library](https://github.com/01org/mkl-dnn#linking-your-application) is a subset of [Intel MKL](https://software.intel.com/en-us/mkl).
It mainly contains the math primitives and operations relevant to deep learning and is generally updated together with each new MKL-DNN [release](https://github.com/01org/mkl-dnn/releases).
2. [MKL-DNN System Requirements](https://github.com/01org/mkl-dnn#system-requirements).
Currently PaddlePaddle uses MKL-DNN only on machines that support the AVX2 instruction set or above.
3. [The original proposal](https://github.com/PaddlePaddle/Paddle/pull/3096) would have introduced **nextLayer** information,
but in PaddlePaddle neither the pre-refactor layers nor the post-refactor operators are supposed to know anything about the next layer/op.
4. MKL-DNN's high-performance formats differ from PaddlePaddle's native `NCHW` format (the cuDNN parts of PaddlePaddle also use `NCHW`, so they do not have this problem),
so a conversion mechanism is needed, and the format should only be converted when necessary to get the best performance out of MKL-DNN.
This tutorial introduces techniques we use to profile and tune the
CPU performance of PaddlePaddle. We will use Python packages
`cProfile` and `yep`, and Google's `perftools`.
Profiling is the process that reveals performance bottlenecks,
which could be very different from what's in the developers' minds.
Performance tuning is done to fix these bottlenecks. Performance optimization
repeats the steps of profiling and tuning alternately.
PaddlePaddle users program AI applications by calling the Python API, which calls
into `libpaddle.so`, written in C++. In this tutorial, we focus on
the profiling and tuning of
1. the Python code and
1. the mixture of Python and C++ code.
## Profiling the Python Code
### Generate the Performance Profiling File
We can use the Python standard
package [`cProfile`](https://docs.python.org/2/library/profile.html)
to generate a Python profiling file. For example:
```bash
python -m cProfile -o profile.out main.py
```
where `main.py` is the program we are going to profile, and `-o` specifies
the output file. Without `-o`, `cProfile` would output to the standard
output.
### Look into the Profiling File
`cProfile` generates `profile.out` after `main.py` completes. We can
use [`cprofilev`](https://github.com/ymichael/cprofilev) to look into
the details:
```bash
cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py
```
where `-a` specifies the HTTP IP, `-p` specifies the port, `-f`
specifies the profiling file, and `main.py` is the source file.
Open a Web browser and point it to the local IP and the specified
port, and we will see output like the following:
```text
ncalls tottime percall cumtime percall filename:lineno(function)
1 0.284 0.284 29.514 29.514 main.py:1(<module>)
4696 0.128 0.000 15.748 0.003 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/executor.py:20(run)
......@@ -44,23 +54,23 @@ cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py
1 0.144 0.144 6.534 6.534 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/__init__.py:14(<module>)
```
where each line corresponds to a Python function, and the meaning of
each column is as follows:
| column | meaning |
| --- | --- |
| ncalls | the number of calls into a function |
| tottime | the total execution time of the function, not including the execution time of other functions called by the function |
| percall | tottime divided by ncalls |
| cumtime | the total execution time of the function, including the execution time of other functions being called |
| percall | cumtime divided by ncalls |
| filename:lineno(function) | where the function is defined |
### Identify Performance Bottlenecks
Usually, `tottime` and the related `percall` time are what we want to
focus on. We can sort the above profiling file by tottime:
```text
4696 12.040 0.003 12.040 0.003 {built-in method run}
......@@ -68,12 +78,15 @@ cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py
107991 0.676 0.000 1.519 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:219(__init__)
4697 0.626 0.000 2.291 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp)
1 0.618 0.618 0.618 0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/__init__.py:1(<module>)
```
We can see that the most time-consuming function is the `built-in
method run`, which is a C++ function in `libpaddle.so`. We will
explain how to profile C++ code in the next section. At this
moment, let's look into the third function, `sync_with_cpp`, which is a
Python function. We can click it to understand more about it:
```text
Called By:
Ordered by: internal time
......@@ -92,72 +105,93 @@ Called:
List reduced from 4497 to 2 due to restriction <'sync_with_cpp'>
```
The list of callers of `sync_with_cpp` might help us understand
how to improve the function definition.
## Profiling Python and C++ Code
### Generate the Profiling File
To profile a mixture of Python and C++ code, we can use a Python
package, `yep`, that can work with Google's `perftools`, which is a
commonly-used profiler for C/C++ code.
On Ubuntu systems, we can install `yep` and `perftools` by running the
following commands:
```bash
apt update
apt install libgoogle-perftools-dev
pip install yep
```
Then we can run the following command
```bash
python -m yep -v main.py
```
to generate the profiling file. The default filename is
`main.py.prof`.
Please be aware of the `-v` command line option, which prints the
analysis results after generating the profiling file. By examining the
printed result, we can tell whether debug information was stripped from
`libpaddle.so` at build time. The following hints help make sure that
the analysis results are readable:
1. Use the GCC command line option `-g` when building `libpaddle.so` so as to
include the debug information. The standard building system of
PaddlePaddle is CMake, so you might want to set
`CMAKE_BUILD_TYPE=RelWithDebInfo`.
1. Use the GCC command line option `-O2` or `-O3` to generate optimized
binary code. It doesn't make sense to profile `libpaddle.so`
without optimization, because it would run slowly anyway.
1. Profile the single-threaded binary before the
multi-threaded version, because the latter often generates a tangled
profiling analysis result. You might want to set the environment
variable `OMP_NUM_THREADS=1` to prevent OpenMP from automatically
starting multiple threads.
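Putting the three hints together, one plausible end-to-end recipe is shown below; the build directory layout is an assumption:

```bash
cd build
cmake .. -DCMAKE_BUILD_TYPE=RelWithDebInfo   # -g plus -O2: debuggable and optimized
make -j"$(nproc)"
cd ..
OMP_NUM_THREADS=1 python -m yep -v main.py   # profile single-threaded first
```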
### Examining the Profiling File
The tool we use to examine the profiling file generated by
`perftools` is [`pprof`](https://github.com/google/pprof), which
provides a Web-based GUI like `cprofilev`.

We can rely on the standard Go toolchain to retrieve the source code
of `pprof` and build it:
```bash
go get github.com/google/pprof
```
Then we can use it to profile `main.py.prof` generated in the previous
section:
```bash
pprof -http=0.0.0.0:3213 `which python` ./main.py.prof
```
where `-http` specifies the IP and port of the HTTP service.
Directing our Web browser to the service, we will see something like
the following:
![result](./pprof_1.png)
### Identifying the Performance Bottlenecks
Similar to how we work with `cprofilev`, we'd focus on `tottime` and
`cumtime`.
![kernel_perf](./pprof_2.png)
We can see that the execution of multiplication and the computation
of the gradient of multiplication take 2% to 4% of the total running
time, and `MomentumOp` takes about 17%. Obviously, we'd want to
optimize `MomentumOp`.
`pprof` marks the performance-critical parts of the program in
red. It's a good idea to follow those hints.
......@@ -4,6 +4,16 @@ else ()
set(PADDLE_FLOAT_TYPE float)
endif()
execute_process(
COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
OUTPUT_VARIABLE PADDLE_GIT_COMMIT
RESULT_VARIABLE PADDLE_GIT_COMMIT_RESULT
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
if(NOT PADDLE_GIT_COMMIT)
set(PADDLE_GIT_COMMIT "no commit information")
endif()
# config.h used for C-API. It will store Paddle building configuration as a
# header. Make user just include PaddleCAPI.h then can get building
# configuration without explicitly set -DPADDLE_WITH_DOUBLE when building their
......
......@@ -3,6 +3,9 @@
typedef @PADDLE_FLOAT_TYPE@ paddle_real;
#define __PADDLE_VERSION__ "@PADDLE_VERSION@"
#define __PADDLE_COMMIT__ "@PADDLE_GIT_COMMIT@"
// Since we only support Linux and macOS for compilation, the compiler is
// always clang or gcc 4.8+, so DLL_IMPORT/DLL_EXPORT is as simple as below.
#define PD_API __attribute__((visibility("default")))
......
......@@ -27,6 +27,18 @@
namespace paddle {
namespace framework {
static std::unordered_set<std::string>* g_ctrl_flow_ops_ = nullptr;
// The backward pass of control flow operators is significantly different
// from that of computational operators. Hack code here.
// We should design a better way to compute the backward of CtrlFlowOps.
static std::unordered_set<std::string>& CtrlFlowOps() {
if (g_ctrl_flow_ops_ == nullptr) {
g_ctrl_flow_ops_ =
new std::unordered_set<std::string>{"increment", "lod_rank_table"};
}
return *g_ctrl_flow_ops_;
}
static inline std::unique_ptr<OperatorBase> CreateGradOp(
const OperatorBase& op, const std::unordered_set<std::string>& no_grad_set,
std::unordered_map<std::string, std::string>* grad_to_var) {
......@@ -288,12 +300,24 @@ static void CreateGradVarInBlock(
for (size_t op_index = grad_op_start_index; op_index < ops.size();
++op_index) {
std::unordered_set<std::string> new_vars;
auto& ctrl_flow_ops = CtrlFlowOps();
ForEachVarName(ops[op_index]->Outputs(),
[&](const std::string& grad_var_name) {
if (block_desc->HasVar(grad_var_name)) {
if (ctrl_flow_ops.find(ops[op_index]->Type()) !=
ctrl_flow_ops.end()) {
if (block_desc->HasVarRecursive(grad_var_name)) {
return false;
}
} else {
if (block_desc->HasVar(grad_var_name)) {
return false;
}
}
if (grad_var_name == framework::kEmptyVarName) {
return false;
}
auto var = block_desc->Var(grad_var_name);
VLOG(10) << "Creating Variable " << grad_var_name;
new_vars.insert(var->Name());
auto it = param_name_map.find(grad_var_name);
if (it == param_name_map.end()) {
......@@ -333,14 +357,25 @@ std::vector<std::unique_ptr<OpDescBind>> MakeOpGrad(
// All input gradients of forwarding operator do not need to calculate.
const std::vector<std::string>& inputs = op_desc->InputArgumentNames();
if (AllGradInSet(inputs, *no_grad_vars)) {
VLOG(10) << "Drop operator " << op_desc->Type();
return grad_op_descs; // empty vector
}
// All output gradients of forwarding operator do not need to calculate.
const std::vector<std::string>& outputs = op_desc->OutputArgumentNames();
if (AllGradInSet(outputs, *no_grad_vars)) {
for (const std::string& name : inputs) {
no_grad_vars->insert(GradVarName(name));
VLOG(10) << "Drop operator " << op_desc->Type();
// FIXME: Hack code here
auto& ctrl_flow_ops = CtrlFlowOps();
if (ctrl_flow_ops.find(op_desc->Type()) == ctrl_flow_ops.end()) {
// Only computational ops need to drop their inputs' gradients.
for (const std::string& name : inputs) {
no_grad_vars->insert(GradVarName(name));
VLOG(10) << " Also drop " << GradVarName(name);
}
}
return grad_op_descs; // empty vector
}
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/block_desc.h"
#include "paddle/framework/operator.h"
#include "paddle/framework/program_desc.h"
namespace paddle {
......@@ -42,6 +43,8 @@ bool BlockDescBind::HasVar(const std::string &name) const {
}
VarDescBind *BlockDescBind::FindVarRecursive(const std::string &name) const {
if (name == kEmptyVarName) return nullptr;
auto it = vars_.find(name);
if (it == vars_.end()) {
return Parent() == kNoneBlockIndex ? nullptr
......
......@@ -97,6 +97,10 @@ void Executor::Run(const ProgramDescBind& pdesc, Scope* scope, int block_id,
if (create_local_scope) {
local_scope = &scope->NewScope();
for (auto& var : block.AllVars()) {
if (var->Name() == framework::kEmptyVarName) {
continue;
}
if (var->Persistable()) {
auto* ptr = scope->Var(var->Name());
CreateTensor(ptr, var->GetType());
......
......@@ -65,7 +65,7 @@ class CompileTimeInferShapeContext : public InferShapeContext {
PADDLE_ENFORCE_EQ(in_var->GetType(), VarDesc::LOD_TENSOR,
"The %d-th output of Output(%s) must be LoDTensor.", j,
out);
out_var->SetLoDLevel(in_var->GetLodLevel());
}
bool IsRuntime() const override;
......@@ -466,7 +466,12 @@ DDim CompileTimeInferShapeContext::GetDim(const std::string &name) const {
auto var = block_.FindVarRecursive(name);
PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
try {
auto shape = var->Shape();
if (shape.empty()) {
return framework::make_ddim({0UL});
} else {
return framework::make_ddim(var->Shape());
}
} catch (...) {
VLOG(5) << "GetDim of variable " << name << " error";
std::rethrow_exception(std::current_exception());
......
......@@ -36,12 +36,9 @@ Scope& Scope::NewScope() const {
}
Variable* Scope::Var(const std::string& name) {
auto* v = FindVarLocally(name);
if (v != nullptr) return v;
v = new Variable();
vars_[name] = v;
VLOG(3) << "Create variable " << name;
v->name_ = &(vars_.find(name)->first);
......@@ -57,8 +54,10 @@ Variable* Scope::Var(std::string* name) {
}
Variable* Scope::FindVar(const std::string& name) const {
auto var = FindVarLocally(name);
if (var != nullptr) {
return var;
}
return (parent_ == nullptr) ? nullptr : parent_->FindVar(name);
}
......@@ -116,6 +115,11 @@ std::string Scope::Rename(const std::string& origin_name) const {
Rename(origin_name, var_name);
return var_name;
}
Variable* Scope::FindVarLocally(const std::string& name) const {
auto it = vars_.find(name);
if (it != vars_.end()) return it->second;
return nullptr;
}
} // namespace framework
} // namespace paddle
......@@ -76,6 +76,8 @@ class Scope {
std::string Rename(const std::string& origin_name) const;
private:
Variable* FindVarLocally(const std::string& name) const;
// Call Scope::NewScope for a sub-scope.
explicit Scope(Scope const* parent) : parent_(parent) {}
......
......@@ -12,6 +12,8 @@
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/shape_inference.h"
#include "grad_op_desc_maker.h"
#include "paddle/framework/operator.h"
namespace paddle {
namespace framework {
......@@ -22,6 +24,12 @@ std::vector<framework::DDim> InferShapeContext::GetInputsDim(
return GetDims(names);
}
DDim InferShapeContext::GetInputsElementDim(const std::string &name,
int idx) const {
const std::vector<std::string> &names = Inputs(name);
return this->GetDim(names[idx]);
}
void InferShapeContext::SetOutputsDim(
const std::string &name, const std::vector<framework::DDim> &dims) {
auto &names = Outputs(name);
......@@ -43,6 +51,9 @@ void InferShapeContext::SetDims(const std::vector<std::string> &names,
size_t length = names.size();
PADDLE_ENFORCE_EQ(length, dims.size());
for (size_t i = 0; i < length; ++i) {
if (names[i] == framework::kEmptyVarName) {
continue;
}
SetDim(names[i], dims[i]);
}
}
......
......@@ -37,6 +37,7 @@ class InferShapeContext {
virtual framework::DDim GetInputDim(const std::string &name) const = 0;
std::vector<framework::DDim> GetInputsDim(const std::string &name) const;
DDim GetInputsElementDim(const std::string &name, int idx) const;
virtual void SetOutputDim(const std::string &name, const DDim &dim) = 0;
void SetOutputsDim(const std::string &name,
......
......@@ -21,7 +21,7 @@ template <class T>
struct EigenBlasGemm {
typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, int>,
Eigen::Aligned>
Matrix;
EigenMatrix;
static void compute(const bool transA,
const bool transB,
......@@ -56,14 +56,13 @@ struct EigenBlasGemm {
sizeB[1] = N;
CHECK_EQ(N, ldb);
}
Eigen::array<int, 2> sizeC;
sizeC[0] = M;
sizeC[1] = N;
CHECK_EQ(N, ldc);
Eigen::array<int, 2> sizeC = {{M, ldc}};
Eigen::array<int, 2> offsetC = {{0, 0}};
Eigen::array<int, 2> extentC = {{M, N}};
const Matrix a(const_cast<T*>(A), sizeA);
const Matrix b(const_cast<T*>(B), sizeB);
Matrix c(C, sizeC);
const EigenMatrix a(const_cast<T*>(A), sizeA);
const EigenMatrix b(const_cast<T*>(B), sizeB);
EigenMatrix c(C, sizeC);
typedef typename Eigen::Tensor<T, 2>::DimensionPair DimPair;
Eigen::array<DimPair, 1> dims;
......@@ -72,12 +71,23 @@ struct EigenBlasGemm {
dims[0].second = transB ? 1 : 0;
Eigen::DefaultDevice device;
if (alpha == T(1) && beta == T(0)) {
c.device(device) = a.contract(b, dims);
} else if (alpha == T(1) && beta == T(1)) {
c.device(device) += a.contract(b, dims);
if (N == ldc) {
if (alpha == T(1) && beta == T(0)) {
c.device(device) = a.contract(b, dims);
} else if (alpha == T(1) && beta == T(1)) {
c.device(device) += a.contract(b, dims);
} else {
c.device(device) = alpha * a.contract(b, dims) + beta * c;
}
} else {
c.device(device) = alpha * a.contract(b, dims) + beta * c;
if (alpha == T(1) && beta == T(0)) {
c.slice(offsetC, extentC).device(device) = a.contract(b, dims);
} else if (alpha == T(1) && beta == T(1)) {
c.slice(offsetC, extentC).device(device) += a.contract(b, dims);
} else {
c.slice(offsetC, extentC).device(device) =
alpha * a.contract(b, dims) + beta * c.slice(offsetC, extentC);
}
}
}
};
......
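The new branch above matters when ldc > N: the output is mapped over the full M x ldc buffer and assigned through an M x N slice, so the padding columns at the end of each row stay untouched. A standalone sketch of that pattern for the alpha = 1, beta = 0 case (assumes Eigen's unsupported Tensor module):

#include <unsupported/Eigen/CXX11/Tensor>

// Row-major GEMM writing into a buffer whose leading dimension ldc >= N.
void gemm_sliced(const float* A, const float* B, float* C,
                 int M, int N, int K, int ldc) {
  typedef Eigen::TensorMap<Eigen::Tensor<float, 2, Eigen::RowMajor, int>> Mat;
  Eigen::array<int, 2> sizeA = {{M, K}};
  Eigen::array<int, 2> sizeB = {{K, N}};
  Eigen::array<int, 2> sizeC = {{M, ldc}};  // map the padded buffer
  const Mat a(const_cast<float*>(A), sizeA);
  const Mat b(const_cast<float*>(B), sizeB);
  Mat c(C, sizeC);

  typedef Eigen::Tensor<float, 2>::DimensionPair DimPair;
  Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};  // plain A * B

  Eigen::array<int, 2> offsetC = {{0, 0}};
  Eigen::array<int, 2> extentC = {{M, N}};
  Eigen::DefaultDevice device;
  // Only the leading M x N block of each row is written.
  c.slice(offsetC, extentC).device(device) = a.contract(b, dims);
}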
# gserver package unittests
add_simple_unittest(test_LinearChainCRF)
add_simple_unittest(test_RecurrentLayer)
......@@ -29,6 +28,26 @@ gserver_test(test_KmaxSeqScore)
gserver_test(test_Expand)
gserver_test(test_MaxPoolingWithMaskOutput)
set(PYTHON_PATH
${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
${PADDLE_SOURCE_DIR}/python/:${PADDLE_SOURCE_DIR}/paddle/gserver/tests)
function(gserver_test_with_python TARGET)
add_unittest_without_exec(${TARGET} ${TARGET}.cpp)
add_test(NAME ${TARGET}
COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
endfunction()
gserver_test_with_python(test_PyDataProvider2)
if(WITH_PYTHON)
gserver_test_with_python(test_PyDataProvider)
endif()
if(NOT MOBILE_INFERENCE)
gserver_test_with_python(test_CompareTwoNets)
  # TODO(yuyang18): There is some bug in test_RecurrentGradientMachine; I will fix it.
gserver_test_with_python(test_RecurrentGradientMachine)
endif()
########## test_MKLDNN layers and activations ##########
if(WITH_MKLDNN)
add_unittest_without_exec(test_MKLDNN
......@@ -36,87 +55,43 @@ if(WITH_MKLDNN)
MKLDNNTester.cpp
LayerGradUtil.cpp)
add_test(NAME test_MKLDNN
COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python
${CMAKE_CURRENT_BINARY_DIR}/test_MKLDNN
COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/test_MKLDNN
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
endif()
############## test_PyDataProvider ########################
if(WITH_PYTHON)
add_unittest_without_exec(test_PyDataProvider
test_PyDataProvider.cpp)
add_test(NAME test_PyDataProvider
COMMAND .set_python_path.sh -d ./gserver/tests:${PADDLE_SOURCE_DIR}/python/ ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
endif()
############### test_WarpCTCLayer #######################
if(NOT WITH_DOUBLE AND NOT MOBILE_INFERENCE)
add_unittest_without_exec(test_WarpCTCLayer
test_WarpCTCLayer.cpp)
add_test(NAME test_WarpCTCLayer
COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_WarpCTCLayer --warpctc_dir=${WARPCTC_LIB_DIR}
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
endif()
if(NOT MOBILE_INFERENCE)
################## test_Evaluator #######################
################## test_Evaluator #############
add_unittest(test_Evaluator
test_Evaluator.cpp)
############### test_RecurrentGradientMachine ###############
# TODO(yuyang18): There is some bug in test_RecurrentGradientMachine
# I will fix it.
add_unittest_without_exec(test_RecurrentGradientMachine
test_RecurrentGradientMachine.cpp)
add_test(NAME test_RecurrentGradientMachine
COMMAND .set_python_path.sh -d
${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests
${CMAKE_CURRENT_BINARY_DIR}/test_RecurrentGradientMachine
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
############### test_NetworkCompare ###############
########### test_NetworkCompare ###############
add_unittest_without_exec(test_NetworkCompare
test_NetworkCompare.cpp)
if(WITH_GPU)
add_test(NAME test_NetworkCompare
COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=true
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
set(use_gpu true)
else()
add_test(NAME test_NetworkCompare
COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=false
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
set(use_gpu false)
endif()
endif()
add_unittest_without_exec(test_PyDataProvider2
test_PyDataProvider2.cpp)
add_test(NAME test_PyDataProvider2
COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/paddle/gserver/tests:${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider2
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle
)
add_test(NAME test_NetworkCompare
COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=${use_gpu}
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
################# test_CompareSparse ##################
add_unittest_without_exec(test_CompareSparse
test_CompareSparse.cpp)
if(NOT ON_TRAVIS)
add_test(NAME test_CompareSparse
COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests
./.set_port.sh -p port -n 6
${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
############ test_CompareSparse ################
add_unittest_without_exec(test_CompareSparse
test_CompareSparse.cpp)
if(NOT ON_TRAVIS)
add_test(NAME test_CompareSparse
COMMAND ${PYTHON_PATH} ./.set_port.sh -p port -n 6
${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
endif()
endif()
################ test_CompareTwoNets ######################
add_unittest_without_exec(test_CompareTwoNets
test_CompareTwoNets.cpp)
add_test(NAME test_CompareTwoNets
COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests
${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoNets
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
......@@ -41,7 +41,7 @@ nonseq = embedding_layer(input=label, size=word_dim)
# This hierarchical RNN is designed to be equivalent to the simple RNN in
# sequence_rnn_multi_unequalength_inputs.conf
# sequence_rnn_mixed_inputs.conf
def outer_step(subseq, seq, nonseq, encoding):
outer_mem = memory(name="outer_rnn_state", size=hidden_dim)
......
......@@ -37,7 +37,7 @@ encoding = embedding_layer(input=data2, size=word_dim)
# This hierarchical RNN is designed to be equivalent to the simple RNN in
# sequence_rnn_multi_unequalength_inputs.conf
# sequence_rnn_matched_inputs.conf
def outer_step(subseq, seq, nonseq, encoding):
outer_mem = memory(name="outer_rnn_state", size=hidden_dim)
......
......@@ -26,8 +26,6 @@ else()
endif()
if(MOBILE_INFERENCE)
list(REMOVE_ITEM MATH_SOURCES
${CMAKE_CURRENT_SOURCE_DIR}/SIMDFunctions.cpp)
# Remove sparse
list(REMOVE_ITEM MATH_HEADERS
${CMAKE_CURRENT_SOURCE_DIR}/CpuSparseMatrix.h
......
......@@ -116,9 +116,11 @@ inline bool vec_check(size_t len) {
}
namespace internal {
#ifdef __SSE3__
void addToImpl(float* a, const float* b, size_t len);
void batchAddToImpl(float* a, const float* b[], int batch, size_t len);
void colMaxImpl(float* result, const float* data, int dim, int numSamples);
#endif
#ifdef __AVX__
void decayL1AvxImpl(float* dst, float* src, float lambda, size_t len);
void decayL1AvxImpl(
......
......@@ -81,18 +81,33 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
}
template <>
void* Alloc<platform::GPUPlace>(platform::GPUPlace place, size_t size) {
return GetGPUBuddyAllocator(place.device)->Alloc(size);
size_t Used<platform::GPUPlace>(platform::GPUPlace place) {
return GetGPUBuddyAllocator(place.device)->Used();
}
template <>
void Free<platform::GPUPlace>(platform::GPUPlace place, void* p) {
GetGPUBuddyAllocator(place.device)->Free(p);
void* Alloc<platform::GPUPlace>(platform::GPUPlace place, size_t size) {
auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
auto* ptr = buddy_allocator->Alloc(size);
if (ptr == nullptr) {
int cur_dev = platform::GetCurrentDeviceId();
platform::SetDeviceId(place.device);
size_t avail, total;
platform::GpuMemoryUsage(avail, total);
LOG(WARNING) << "Cannot allocate " << size << " bytes in GPU "
<< place.device << ", available " << avail << " bytes";
LOG(WARNING) << "total " << total;
LOG(WARNING) << "GpuMinChunkSize " << platform::GpuMinChunkSize();
LOG(WARNING) << "GpuMaxChunkSize " << platform::GpuMaxChunkSize();
LOG(WARNING) << "GPU memory used: " << Used<platform::GPUPlace>(place);
platform::SetDeviceId(cur_dev);
}
return ptr;
}
template <>
size_t Used<platform::GPUPlace>(platform::GPUPlace place) {
return GetGPUBuddyAllocator(place.device)->Used();
void Free<platform::GPUPlace>(platform::GPUPlace place, void* p) {
GetGPUBuddyAllocator(place.device)->Free(p);
}
#endif
......
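The reshuffled Alloc above adds a diagnostics-only failure path: switch to the failing device, query and log the memory statistics, then restore the caller's device. A self-contained sketch of the pattern; the gpu_* functions are stubs standing in for the platform:: calls in the diff:

#include <cstddef>
#include <iostream>

static int g_device = 0;  // stub device state
int gpu_current_device() { return g_device; }
void gpu_set_device(int id) { g_device = id; }
void gpu_memory_usage(std::size_t* avail, std::size_t* total) {
  *avail = 0;
  *total = std::size_t(1) << 30;
}

void* alloc_with_diagnostics(void* (*raw_alloc)(std::size_t), int device,
                             std::size_t size) {
  void* ptr = raw_alloc(size);
  if (ptr == nullptr) {
    // Point the queries at the failing GPU, then restore the caller's
    // device so the failure path has no lasting side effects.
    int cur = gpu_current_device();
    gpu_set_device(device);
    std::size_t avail, total;
    gpu_memory_usage(&avail, &total);
    std::cerr << "Cannot allocate " << size << " bytes on GPU " << device
              << "; " << avail << " of " << total << " bytes available\n";
    gpu_set_device(cur);
  }
  return ptr;
}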
......@@ -212,18 +212,22 @@ set(DEPS_OPS
send_op
recv_op)
if(WITH_DISTRIBUTE)
add_subdirectory(detail)
op_library(send_op SRCS send_op.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf)
set_source_files_properties(
send_op.cc
PROPERTIES
COMPILE_FLAGS "-Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
op_library(recv_op SRCS recv_op.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf)
set_source_files_properties(
recv_op.cc
PROPERTIES
COMPILE_FLAGS "-Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op recv_op sum_op executor)
endif()
op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op)
op_library(cross_entropy_op DEPS cross_entropy)
......@@ -275,4 +279,3 @@ if(WITH_GPU)
cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
endif()
cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op recv_op sum_op executor)
......@@ -25,7 +25,7 @@ class ConcatOp : public framework::OperatorWithKernel {
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE_GE(ctx->Inputs("X").size(), 1UL,
"Inputs(X) of ConcatOp should be empty.")
"Inputs(X) of ConcatOp should be empty.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of ConcatOp should not be null.");
......@@ -45,7 +45,7 @@ class ConcatOp : public framework::OperatorWithKernel {
}
PADDLE_ENFORCE_EQ(out_dims[j], ins[i][j],
"Input tensors should have the same "
"elements except the specify axis.")
"elements except the specify axis.");
}
}
ctx->SetOutputDim("Out", out_dims);
......
......@@ -32,4 +32,4 @@ message VariableMessage {
bytes serialized = 2;
}
message VoidMessage {}
\ No newline at end of file
message VoidMessage {}
......@@ -35,7 +35,7 @@ class ElementwiseOp : public framework::OperatorWithKernel {
auto x_dim = ctx->GetInputDim("X");
auto y_dim = ctx->GetInputDim("Y");
PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(),
"Rank of first input must >= rank of second input.")
"Rank of first input must >= rank of second input.");
ctx->SetOutputDim("Out", x_dim);
ctx->ShareLoD("X", /*->*/ "Out");
}
......@@ -120,7 +120,7 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {
auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
"Rank of first input must >= rank of second input.")
"Rank of first input must >= rank of second input.");
auto x_grad_name = framework::GradVarName("X");
auto y_grad_name = framework::GradVarName("Y");
......
......@@ -106,7 +106,7 @@ void ElementwiseCompute(const framework::ExecutionContext& ctx) {
auto x_dims = x->dims();
auto y_dims = y->dims();
PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
"Rank of first input must >= rank of second input.")
"Rank of first input must >= rank of second input.");
if (x_dims == y_dims) {
functor f;
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/hinge_loss_op.h"
namespace paddle {
namespace operators {
class HingeLossOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Logits"),
"Input(Logits) must be initialized.");
PADDLE_ENFORCE(ctx->HasInput("Labels"),
"Input(Labels) must be initialized.");
auto pred_dims = ctx->GetInputDim("Logits");
auto label_dims = ctx->GetInputDim("Labels");
PADDLE_ENFORCE_EQ(pred_dims, label_dims);
PADDLE_ENFORCE_EQ(pred_dims.size(), 2,
"The rank of Input(Logits) must be 2 and the shape is "
"[batch_size, 1].");
PADDLE_ENFORCE_EQ(pred_dims[1], 1,
"Each row of Input(Logits) contains a real value, "
"so the 2nd dimension of Input(Logits) must be 1.");
ctx->SetOutputDim("Loss", {pred_dims[0], 1});
ctx->ShareLoD("Logits", "Loss");
}
};
template <typename AttrType>
class HingeLossOpMaker : public framework::OpProtoAndCheckerMaker {
public:
HingeLossOpMaker(framework::OpProto* proto,
framework::OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("Logits",
"The input value (Logits) of Hinge loss op."
"Logits is a 2-D tensor with shape [batch_size, 1].");
AddInput("Labels",
"The target value (Labels) of Hinge loss op."
"Labels is a 2-D tensor with shape [batch_size, 1].");
AddOutput("Loss",
"The output tensor with shape [batch_size, 1] "
"which represents the hinge loss.");
AddComment(R"DOC(
HingeLoss Operator.
Let x be a logit (prediction) and y be the actual label. The logit can
take any values from (-inf, inf), but the labels should be either -1 or 1.
Then, the hinge loss is computed as follows:
$$
L(x, y) = \max(1 - y \cdot x,\ 0)
$$
Note that the labels passed as input take values of either 0 or 1; the kernels
map them to -1 or 1 (via 2y - 1) before computing the loss.
)DOC");
}
};
class HingeLossGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Logits"),
"Input(Logits) should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Labels"),
"Input(Labels) should not be null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Loss")),
"Input(Loss@GRAD) should not be null.");
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Logits")),
"Input(Logits@GRAD) should not be null.");
auto pred_dims = ctx->GetInputDim("Logits");
auto lab_dims = ctx->GetInputDim("Labels");
auto loss_grad_dims = ctx->GetInputDim(framework::GradVarName("Loss"));
PADDLE_ENFORCE_EQ(loss_grad_dims, pred_dims);
auto pred_grad_name = framework::GradVarName("Logits");
ctx->SetOutputDim(pred_grad_name, pred_dims);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(hinge_loss, ops::HingeLossOp, ops::HingeLossOpMaker<float>,
hinge_loss_grad, ops::HingeLossGradOp);
REGISTER_OP_CPU_KERNEL(hinge_loss,
ops::HingeLossKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(
hinge_loss_grad,
ops::HingeLossGradKernel<paddle::platform::CPUPlace, float>);
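The DOC above writes the loss in terms of y in {-1, 1} while the incoming labels are 0/1, so the kernels first remap them (see hinge_loss_op.h below). A quick numeric check of that convention:

$$
a = 2y - 1, \qquad L(x, y) = \max(1 - a \cdot x,\ 0)
$$

For x = 0.3: y = 1 gives a = 1 and L = max(1 - 0.3, 0) = 0.7, while y = 0 gives a = -1 and L = max(1 + 0.3, 0) = 1.3.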
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/operators/hinge_loss_op.h"
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(hinge_loss,
ops::HingeLossKernel<paddle::platform::GPUPlace, float>);
REGISTER_OP_GPU_KERNEL(
hinge_loss_grad,
ops::HingeLossGradKernel<paddle::platform::GPUPlace, float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
namespace paddle {
namespace operators {
template <typename Place, typename T, typename AttrType = T>
class HingeLossKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* pred = context.Input<framework::Tensor>("Logits");
auto* label = context.Input<framework::Tensor>("Labels");
auto* loss = context.Output<framework::Tensor>("Loss");
auto place = context.GetEigenDevice<Place>();
auto x = framework::EigenVector<T>::Flatten(*pred);
auto y = framework::EigenVector<T>::Flatten(*label);
loss->mutable_data<T>(context.GetPlace());
auto l = framework::EigenVector<T>::Flatten(*loss);
l.device(place) =
(static_cast<T>(1) - x * (static_cast<T>(2) * y - static_cast<T>(1)))
.cwiseMax(static_cast<T>(0));
}
};
template <typename Place, typename T, typename AttrType = T>
class HingeLossGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* pred = context.Input<framework::Tensor>("Logits");
auto* label = context.Input<framework::Tensor>("Labels");
auto* dloss =
context.Input<framework::Tensor>(framework::GradVarName("Loss"));
auto* dpred =
context.Output<framework::Tensor>(framework::GradVarName("Logits"));
auto place = context.GetEigenDevice<Place>();
auto x = framework::EigenVector<T>::Flatten(*pred);
auto y = framework::EigenVector<T>::Flatten(*label);
auto dl = framework::EigenVector<T>::Flatten(*dloss);
if (dpred) {
dpred->mutable_data<T>(context.GetPlace());
auto dx = framework::EigenVector<T>::Flatten(*dpred);
auto alt_labels = static_cast<T>(2) * y - static_cast<T>(1);
dx.device(place) =
dl * ((x * alt_labels) < static_cast<T>(1)).template cast<T>() *
(-alt_labels);
}
}
};
} // namespace operators
} // namespace paddle
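For reference, HingeLossGradKernel above implements the subgradient of the hinge loss under the same a = 2y - 1 remapping; with dl the incoming gradient of Loss:

$$
\frac{\partial L}{\partial x} =
\begin{cases}
-a, & a \cdot x < 1 \\
0, & \text{otherwise}
\end{cases}
\qquad\Rightarrow\qquad
dx = dl \cdot \mathbb{1}[a \cdot x < 1] \cdot (-a)
$$

which is exactly dl * ((x * alt_labels) < 1) * (-alt_labels) in the code.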
......@@ -61,6 +61,8 @@ class IncrementOp : public framework::OperatorBase {
out.Resize(x.dims());
out.mutable_data(x.place(), x.type());
float value = Attr<float>("step");
VLOG(10) << Output("Out") << " increase " << Input("X") << " with "
<< value;
framework::VisitDataType(framework::ToDataType(out.type()),
IncrementFunctor(x, &out, value));
}
......
......@@ -14,6 +14,7 @@
#include "paddle/framework/lod_rank_table.h"
#include "paddle/framework/lod_tensor_array.h"
#include "paddle/framework/op_registry.h"
#include "paddle/operators/detail/safe_ref.h"
namespace paddle {
namespace operators {
......@@ -32,15 +33,20 @@ class LoDTensorToArrayOp : public framework::OperatorBase {
: OperatorBase(type, inputs, outputs, attrs) {}
void Run(const framework::Scope &scope,
const platform::DeviceContext &dev_ctx) const override {
auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
auto &rank_table =
scope.FindVar(Input("RankTable"))->Get<framework::LoDRankTable>();
auto &out =
*scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensorArray>();
auto &x = detail::Ref(scope.FindVar(Input("X")), "Cannot find input %s",
Input("X"))
.Get<framework::LoDTensor>();
auto &rank_table = detail::Ref(scope.FindVar(Input("RankTable")))
.Get<framework::LoDRankTable>();
auto &out = *detail::Ref(scope.FindVar(Output("Out")))
.GetMutable<framework::LoDTensorArray>();
auto &items = rank_table.items();
auto max_seq_len = items[0].length;
auto rank_level = rank_table.level();
PADDLE_ENFORCE_LT(rank_level, x.lod().size(),
"Input should be a LOD tensor, and size is at least %d",
rank_level + 1);
out.resize(max_seq_len);
std::vector<std::vector<CopyRange>> copy_ranges(max_seq_len);
......@@ -55,16 +61,13 @@ class LoDTensorToArrayOp : public framework::OperatorBase {
size_t start_idx = x.lod()[rank_level][item.index] + t;
auto lod_and_offset = framework::GetSubLoDAndAbsoluteOffset(
x.lod(), start_idx, start_idx + 1, rank_level + 1);
auto &lod_length = lod_and_offset.first;
framework::AppendLoD(&lod, lod_length);
size_t start_offset = lod_and_offset.second.first;
size_t end_offset = lod_and_offset.second.second;
copy_ranges[t].emplace_back(CopyRange{start_offset, end_offset});
}
}
for (size_t i = 0; i < max_seq_len; ++i) {
auto &ranges = copy_ranges[i];
size_t height = std::accumulate(
......
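detail::Ref comes from the newly included safe_ref.h, which this diff does not show; from the call sites it is assumed to convert a possibly-null pointer into a reference, failing with the given message instead of dereferencing null. A hypothetical standalone sketch:

#include <stdexcept>
#include <string>

// Assumed shape of detail::Ref (the real helper takes printf-style message
// arguments and uses PADDLE_ENFORCE; this sketch throws instead).
template <typename T>
T& Ref(T* ptr, const std::string& msg = "pointer is null") {
  if (ptr == nullptr) throw std::runtime_error(msg);
  return *ptr;
}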
......@@ -181,7 +181,7 @@ class LSTMOpMaker : public framework::OpProtoAndCheckerMaker {
AddComment(R"DOC(
Long-Short Term Memory (LSTM) Operator.
The default implementation is diagonal/peephole connection
(https://arxiv.org/pdf/1402.1128.pdf), the formula is as follows:
$$
......@@ -198,27 +198,27 @@ c_t = f_t \odot c_{t-1} + i_t \odot \tilde{c_t} \\
h_t = o_t \odot act_h(c_t)
$$
where the W terms denote weight matrices (e.g. \f$W_{xi}\f$ is the matrix
of weights from the input gate to the input), \f$W_{ic}, W_{fc}, W_{oc}\f$
where the W terms denote weight matrices (e.g. $W_{xi}$ is the matrix
of weights from the input gate to the input), $W_{ic}, W_{fc}, W_{oc}$
are diagonal weight matrices for peephole connections. In our implementation,
we use vectors to represent these diagonal weight matrices. The b terms
denote bias vectors (\f$b_i\f$ is the input gate bias vector), \f$\sigma\f$
denote bias vectors ($b_i$ is the input gate bias vector), $\sigma$
is the non-linear activation, such as the logistic sigmoid function, and
\f$i, f, o\f$ and \f$c\f$ are the input gate, forget gate, output gate,
$i, f, o$ and $c$ are the input gate, forget gate, output gate,
and cell activation vectors, respectively, all of which have the same size as
the cell output activation vector \f$h\f$.
the cell output activation vector $h$.
The \f$\odot\f$ is the element-wise product of the vectors. \f$act_g\f$ and \f$act_h\f$
The $\odot$ is the element-wise product of the vectors. $act_g$ and $act_h$
are the cell input and cell output activation functions and `tanh` is usually
used for them. \f$\tilde{c_t}\f$ is also called candidate hidden state,
used for them. $\tilde{c_t}$ is also called candidate hidden state,
which is computed based on the current input and the previous hidden state.
Set `use_peepholes` False to disable peephole connection
(http://www.bioinf.jku.at/publications/older/2604.pdf). The formula
is omitted here.
Set `use_peepholes` False to disable peephole connection. The formula
is omitted here, please refer to the paper
http://www.bioinf.jku.at/publications/older/2604.pdf for details.
Note that these \f$W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}\f$
operations on the input \f$x_{t}\f$ are NOT included in this operator.
Note that these $W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}$
operations on the input $x_{t}$ are NOT included in this operator.
Users can choose to use a fully-connected operator before the LSTM operator.
)DOC");
......
......@@ -73,15 +73,15 @@ class LSTMKernel : public framework::OpKernel<T> {
T* bias_data = const_cast<T*>(bias->data<T>());
// the code style in LstmMetaValue will be updated later.
lstm_value.checkIg = bias_data + 4 * frame_size;
lstm_value.checkFg = lstm_value.checkIg + frame_size;
lstm_value.checkOg = lstm_value.checkFg + frame_size;
lstm_value.check_ig = bias_data + 4 * frame_size;
lstm_value.check_fg = lstm_value.check_ig + frame_size;
lstm_value.check_og = lstm_value.check_fg + frame_size;
} else {
lstm_value.checkIg = nullptr;
lstm_value.checkFg = nullptr;
lstm_value.checkOg = nullptr;
lstm_value.check_ig = nullptr;
lstm_value.check_fg = nullptr;
lstm_value.check_og = nullptr;
}
lstm_value.prevStateValue = nullptr;
lstm_value.prev_state_value = nullptr;
Tensor ordered_c0;
const size_t* order = batch_gate->lod()[2].data();
if (cell_t0) {
......@@ -90,7 +90,7 @@ class LSTMKernel : public framework::OpKernel<T> {
// to reorder.
ReorderInitState<Place, T>(device_ctx, *cell_t0, order, &ordered_c0,
true);
lstm_value.prevStateValue = ordered_c0.data<T>();
lstm_value.prev_state_value = ordered_c0.data<T>();
}
    // Use the local variable here.
......@@ -140,14 +140,14 @@ class LSTMKernel : public framework::OpKernel<T> {
static_cast<T>(1.0));
}
lstm_value.gateValue = gate_t.data<T>();
lstm_value.outputValue = out_t.data<T>();
lstm_value.stateValue = cell_t.data<T>();
lstm_value.stateActiveValue = cell_pre_act_t.data<T>();
lstm_value.gate_value = gate_t.data<T>();
lstm_value.output_value = out_t.data<T>();
lstm_value.state_value = cell_t.data<T>();
lstm_value.state_active_value = cell_pre_act_t.data<T>();
math::LstmUnitFunctor<Place, T>::compute(device_ctx, lstm_value,
frame_size, cur_batch_size,
gate_act, cell_act, cand_act);
lstm_value.prevStateValue = lstm_value.stateValue;
lstm_value.prev_state_value = lstm_value.state_value;
}
math::Batch2LoDTensorFunctor<Place, T> to_seq;
......@@ -214,13 +214,13 @@ class LSTMGradKernel : public framework::OpKernel<T> {
math::LstmMetaValue<T> lstm_value;
if (bias && ctx.Attr<bool>("use_peepholes")) {
T* bias_data = const_cast<T*>(bias->data<T>());
lstm_value.checkIg = bias_data + 4 * frame_size;
lstm_value.checkFg = lstm_value.checkIg + frame_size;
lstm_value.checkOg = lstm_value.checkFg + frame_size;
lstm_value.check_ig = bias_data + 4 * frame_size;
lstm_value.check_fg = lstm_value.check_ig + frame_size;
lstm_value.check_og = lstm_value.check_fg + frame_size;
} else {
lstm_value.checkIg = nullptr;
lstm_value.checkFg = nullptr;
lstm_value.checkOg = nullptr;
lstm_value.check_ig = nullptr;
lstm_value.check_fg = nullptr;
lstm_value.check_og = nullptr;
}
math::LstmMetaGrad<T> lstm_grad;
......@@ -231,13 +231,13 @@ class LSTMGradKernel : public framework::OpKernel<T> {
}
if (bias && bias_g && ctx.Attr<bool>("use_peepholes")) {
T* bias_g_data = bias_g->data<T>();
lstm_grad.checkIgGrad = bias_g_data + 4 * frame_size;
lstm_grad.checkFgGrad = lstm_grad.checkIgGrad + frame_size;
lstm_grad.checkOgGrad = lstm_grad.checkFgGrad + frame_size;
lstm_grad.check_ig_grad = bias_g_data + 4 * frame_size;
lstm_grad.check_fg_grad = lstm_grad.check_ig_grad + frame_size;
lstm_grad.check_og_grad = lstm_grad.check_fg_grad + frame_size;
} else {
lstm_grad.checkIgGrad = nullptr;
lstm_grad.checkFgGrad = nullptr;
lstm_grad.checkOgGrad = nullptr;
lstm_grad.check_ig_grad = nullptr;
lstm_grad.check_fg_grad = nullptr;
lstm_grad.check_og_grad = nullptr;
}
math::LoDTensor2BatchFunctor<Place, T> to_batch;
......@@ -276,26 +276,26 @@ class LSTMGradKernel : public framework::OpKernel<T> {
Tensor gate = batch_gate->Slice(bstart, bend);
Tensor cell = batch_cell.Slice(bstart, bend);
Tensor cell_pre_act = batch_cell_pre_act->Slice(bstart, bend);
lstm_value.gateValue = gate.data<T>();
lstm_value.stateValue = cell.data<T>();
lstm_value.stateActiveValue = cell_pre_act.data<T>();
lstm_value.gate_value = gate.data<T>();
lstm_value.state_value = cell.data<T>();
lstm_value.state_active_value = cell_pre_act.data<T>();
Tensor out_g = batch_hidden_g.Slice(bstart, bend);
Tensor gate_g = batch_gate_g.Slice(bstart, bend);
Tensor cell_g = batch_cell_g.Slice(bstart, bend);
lstm_grad.stateGrad = cell_g.data<T>();
lstm_grad.gateGrad = gate_g.data<T>();
lstm_grad.outputGrad = out_g.data<T>();
lstm_grad.state_grad = cell_g.data<T>();
lstm_grad.gate_grad = gate_g.data<T>();
lstm_grad.output_grad = out_g.data<T>();
if (n > 0) {
int bstart_pre = static_cast<int>(batch_starts[n - 1]);
Tensor cell_pre = batch_cell.Slice(bstart_pre, bstart);
Tensor cell_pre_g = batch_cell_g.Slice(bstart_pre, bstart);
lstm_value.prevStateValue = cell_pre.data<T>();
lstm_grad.prevStateGrad = cell_pre_g.data<T>();
lstm_value.prev_state_value = cell_pre.data<T>();
lstm_grad.prev_state_grad = cell_pre_g.data<T>();
} else {
lstm_value.prevStateValue = c0 ? ordered_c0.data<T>() : nullptr;
lstm_grad.prevStateGrad = c0_g ? ordered_c0_g.data<T>() : nullptr;
lstm_value.prev_state_value = c0 ? ordered_c0.data<T>() : nullptr;
lstm_grad.prev_state_grad = c0_g ? ordered_c0_g.data<T>() : nullptr;
}
int cur_batch_size = bend - bstart;
......
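Both kernels above slice the peephole weights out of the bias tensor with the same pointer arithmetic, which implies a packed layout: 4 * frame_size gate biases first, then the three peephole vectors. A small sketch of that view:

// Assumed bias layout when use_peepholes is on:
// [b_i | b_f | b_c | b_o | W_ic | W_fc | W_oc], each segment frame_size wide.
struct PeepholeView {
  float* check_ig;
  float* check_fg;
  float* check_og;
};

PeepholeView SlicePeepholes(float* bias_data, int frame_size) {
  PeepholeView v;
  v.check_ig = bias_data + 4 * frame_size;  // skip the four gate biases
  v.check_fg = v.check_ig + frame_size;
  v.check_og = v.check_fg + frame_size;
  return v;
}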
......@@ -26,278 +26,284 @@ namespace detail {
template <class T, class Op>
void naive_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
int frameSize,
int frame_size,
activation_mode_t active_node,
activation_mode_t active_gate,
activation_mode_t active_state) {
T rValueIn;
T rValueIg;
T rValueFg;
T rValueOg;
T rCheckI;
T rCheckF;
T rCheckO;
T rState;
T rPrevState = 0;
T rStateAtv;
T rOut;
T *valueIn = value.gateValue;
T *valueIg = value.gateValue + frameSize;
T *valueFg = value.gateValue + frameSize * 2;
T *valueOg = value.gateValue + frameSize * 3;
for (int i = 0; i < frameSize; i++) {
rValueIn = valueIn[i];
rValueIg = valueIg[i];
rValueFg = valueFg[i];
rValueOg = valueOg[i];
rCheckI = value.checkIg ? value.checkIg[i] : 0;
rCheckF = value.checkFg ? value.checkFg[i] : 0;
rCheckO = value.checkOg ? value.checkOg[i] : 0;
if (value.prevStateValue) {
rPrevState = value.prevStateValue[i];
T r_value_in;
T r_value_ig;
T r_value_fg;
T r_value_og;
T r_checkI;
T r_checkF;
T r_checkO;
T r_state;
T r_prev_state = 0;
T r_state_atv;
T r_out;
T *value_in = value.gate_value;
T *value_ig = value.gate_value + frame_size;
T *value_fg = value.gate_value + frame_size * 2;
T *value_og = value.gate_value + frame_size * 3;
for (int i = 0; i < frame_size; i++) {
r_value_in = value_in[i];
r_value_ig = value_ig[i];
r_value_fg = value_fg[i];
r_value_og = value_og[i];
r_checkI = value.check_ig ? value.check_ig[i] : 0;
r_checkF = value.check_fg ? value.check_fg[i] : 0;
r_checkO = value.check_og ? value.check_og[i] : 0;
if (value.prev_state_value) {
r_prev_state = value.prev_state_value[i];
}
op(rValueIn, rValueIg, rValueFg, rValueOg, rPrevState, rState, rStateAtv,
rOut, rCheckI, rCheckF, rCheckO, active_node, active_gate, active_state);
valueIn[i] = rValueIn;
valueIg[i] = rValueIg;
valueFg[i] = rValueFg;
valueOg[i] = rValueOg;
value.stateValue[i] = rState;
value.stateActiveValue[i] = rStateAtv;
value.outputValue[i] = rOut;
op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_prev_state, r_state,
r_state_atv, r_out, r_checkI, r_checkF, r_checkO, active_node,
active_gate, active_state);
value_in[i] = r_value_in;
value_ig[i] = r_value_ig;
value_fg[i] = r_value_fg;
value_og[i] = r_value_og;
value.state_value[i] = r_state;
value.state_active_value[i] = r_state_atv;
value.output_value[i] = r_out;
}
}
template <class T, class Op>
void naive_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
LstmMetaGrad<T> grad, int frameSize,
LstmMetaGrad<T> grad, int frame_size,
activation_mode_t active_node,
activation_mode_t active_gate,
activation_mode_t active_state) {
T rValueIn;
T rValueIg;
T rValueFg;
T rValueOg;
T rGradIn;
T rGradIg;
T rGradFg;
T rGradOg;
T rPrevState = 0;
T rPrevStateGrad;
T rState;
T rStateGrad;
T rStateAtv;
T rOutputGrad;
T rCheckI;
T rCheckF;
T rCheckO;
T rCheckIGrad;
T rCheckFGrad;
T rCheckOGrad;
T *valueIn = value.gateValue;
T *valueIg = value.gateValue + frameSize;
T *valueFg = value.gateValue + frameSize * 2;
T *valueOg = value.gateValue + frameSize * 3;
T *gradIn = grad.gateGrad;
T *gradIg = grad.gateGrad + frameSize;
T *gradFg = grad.gateGrad + frameSize * 2;
T *gradOg = grad.gateGrad + frameSize * 3;
for (int i = 0; i < frameSize; i++) {
rValueIn = valueIn[i];
rValueIg = valueIg[i];
rValueFg = valueFg[i];
rValueOg = valueOg[i];
rCheckI = value.checkIg ? value.checkIg[i] : 0;
rCheckF = value.checkFg ? value.checkFg[i] : 0;
rCheckO = value.checkOg ? value.checkOg[i] : 0;
rState = value.stateValue[i];
rStateAtv = value.stateActiveValue[i];
rOutputGrad = grad.outputGrad[i];
rStateGrad = grad.stateGrad[i];
if (value.prevStateValue) {
rPrevState = value.prevStateValue[i];
T r_value_in;
T r_value_ig;
T r_value_fg;
T r_value_og;
T r_grad_in;
T r_grad_ig;
T r_grad_fg;
T r_grad_og;
T r_prev_state = 0;
T r_prev_state_grad;
T r_state;
T r_state_grad;
T r_state_atv;
T r_output_grad;
T r_checkI;
T r_checkF;
T r_checkO;
T r_checkIGrad;
T r_checkFGrad;
T r_checkOGrad;
T *value_in = value.gate_value;
T *value_ig = value.gate_value + frame_size;
T *value_fg = value.gate_value + frame_size * 2;
T *value_og = value.gate_value + frame_size * 3;
T *grad_in = grad.gate_grad;
T *grad_ig = grad.gate_grad + frame_size;
T *grad_fg = grad.gate_grad + frame_size * 2;
T *grad_og = grad.gate_grad + frame_size * 3;
for (int i = 0; i < frame_size; i++) {
r_value_in = value_in[i];
r_value_ig = value_ig[i];
r_value_fg = value_fg[i];
r_value_og = value_og[i];
r_checkI = value.check_ig ? value.check_ig[i] : 0;
r_checkF = value.check_fg ? value.check_fg[i] : 0;
r_checkO = value.check_og ? value.check_og[i] : 0;
r_state = value.state_value[i];
r_state_atv = value.state_active_value[i];
r_output_grad = grad.output_grad[i];
r_state_grad = grad.state_grad[i];
if (value.prev_state_value) {
r_prev_state = value.prev_state_value[i];
}
op(rValueIn, rValueIg, rValueFg, rValueOg, rGradIn, rGradIg, rGradFg,
rGradOg, rPrevState, rPrevStateGrad, rState, rStateGrad, rStateAtv,
rOutputGrad, rCheckI, rCheckF, rCheckO, rCheckIGrad, rCheckFGrad,
rCheckOGrad, active_node, active_gate, active_state);
gradIn[i] = rGradIn;
gradIg[i] = rGradIg;
gradFg[i] = rGradFg;
gradOg[i] = rGradOg;
grad.stateGrad[i] = rStateGrad;
if (grad.prevStateGrad) grad.prevStateGrad[i] = rPrevStateGrad;
if (value.prevStateValue) {
if (grad.checkIgGrad) grad.checkIgGrad[i] += rCheckIGrad;
if (grad.checkFgGrad) grad.checkFgGrad[i] += rCheckFGrad;
op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_grad_in, r_grad_ig,
r_grad_fg, r_grad_og, r_prev_state, r_prev_state_grad, r_state,
r_state_grad, r_state_atv, r_output_grad, r_checkI, r_checkF, r_checkO,
r_checkIGrad, r_checkFGrad, r_checkOGrad, active_node, active_gate,
active_state);
grad_in[i] = r_grad_in;
grad_ig[i] = r_grad_ig;
grad_fg[i] = r_grad_fg;
grad_og[i] = r_grad_og;
grad.state_grad[i] = r_state_grad;
if (grad.prev_state_grad) grad.prev_state_grad[i] = r_prev_state_grad;
if (value.prev_state_value) {
if (grad.check_ig_grad) grad.check_ig_grad[i] += r_checkIGrad;
if (grad.check_fg_grad) grad.check_fg_grad[i] += r_checkFGrad;
}
if (grad.checkOgGrad) grad.checkOgGrad[i] += rCheckOGrad;
if (grad.check_og_grad) grad.check_og_grad[i] += r_checkOGrad;
}
}
template <class T, class Op>
void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value, int frameSize,
void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
int frame_size,
activation_mode_t active_node,
activation_mode_t active_gate,
activation_mode_t active_state) {
#ifdef __AVX__
__m256 rValueIn;
__m256 rValueIg;
__m256 rValueFg;
__m256 rValueOg;
__m256 rCheckI = _mm256_set1_ps(0.0f);
__m256 rCheckF = _mm256_set1_ps(0.0f);
__m256 rCheckO = _mm256_set1_ps(0.0f);
__m256 rState;
__m256 rPrevState = _mm256_set1_ps(0.0f);
__m256 rStateAtv;
__m256 rOut;
__m256 *valueIn = (__m256 *)value.gateValue;
__m256 *valueIg = (__m256 *)(value.gateValue + frameSize);
__m256 *valueFg = (__m256 *)(value.gateValue + frameSize * 2);
__m256 *valueOg = (__m256 *)(value.gateValue + frameSize * 3);
for (int i = 0; i < frameSize / 8; i++) {
rValueIn = valueIn[i];
rValueIg = valueIg[i];
rValueFg = valueFg[i];
rValueOg = valueOg[i];
if (value.checkIg) {
rCheckI = ((__m256 *)value.checkIg)[i];
rCheckF = ((__m256 *)value.checkFg)[i];
rCheckO = ((__m256 *)value.checkOg)[i];
__m256 r_value_in;
__m256 r_value_ig;
__m256 r_value_fg;
__m256 r_value_og;
__m256 r_checkI = _mm256_set1_ps(0.0f);
__m256 r_checkF = _mm256_set1_ps(0.0f);
__m256 r_checkO = _mm256_set1_ps(0.0f);
__m256 r_state;
__m256 r_prev_state = _mm256_set1_ps(0.0f);
__m256 r_state_atv;
__m256 r_out;
__m256 *value_in = (__m256 *)value.gate_value;
__m256 *value_ig = (__m256 *)(value.gate_value + frame_size);
__m256 *value_fg = (__m256 *)(value.gate_value + frame_size * 2);
__m256 *value_og = (__m256 *)(value.gate_value + frame_size * 3);
for (int i = 0; i < frame_size / 8; i++) {
r_value_in = value_in[i];
r_value_ig = value_ig[i];
r_value_fg = value_fg[i];
r_value_og = value_og[i];
if (value.check_ig) {
r_checkI = ((__m256 *)value.check_ig)[i];
r_checkF = ((__m256 *)value.check_fg)[i];
r_checkO = ((__m256 *)value.check_og)[i];
}
if (value.prevStateValue) {
rPrevState = ((__m256 *)value.prevStateValue)[i];
if (value.prev_state_value) {
r_prev_state = ((__m256 *)value.prev_state_value)[i];
}
op(rValueIn, rValueIg, rValueFg, rValueOg, rPrevState, rState, rStateAtv,
rOut, rCheckI, rCheckF, rCheckO, active_node, active_gate, active_state);
valueIn[i] = rValueIn;
valueIg[i] = rValueIg;
valueFg[i] = rValueFg;
valueOg[i] = rValueOg;
((__m256 *)value.stateValue)[i] = rState;
((__m256 *)value.stateActiveValue)[i] = rStateAtv;
((__m256 *)value.outputValue)[i] = rOut;
op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_prev_state, r_state,
r_state_atv, r_out, r_checkI, r_checkF, r_checkO, active_node,
active_gate, active_state);
value_in[i] = r_value_in;
value_ig[i] = r_value_ig;
value_fg[i] = r_value_fg;
value_og[i] = r_value_og;
((__m256 *)value.state_value)[i] = r_state;
((__m256 *)value.state_active_value)[i] = r_state_atv;
((__m256 *)value.output_value)[i] = r_out;
}
#endif
}
template <class T, class Op>
void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
LstmMetaGrad<T> grad, int frameSize,
LstmMetaGrad<T> grad, int frame_size,
activation_mode_t active_node,
activation_mode_t active_gate,
activation_mode_t active_state) {
#ifdef __AVX__
__m256 rValueIn;
__m256 rValueIg;
__m256 rValueFg;
__m256 rValueOg;
__m256 rGradIn;
__m256 rGradIg;
__m256 rGradFg;
__m256 rGradOg;
__m256 rPrevState = _mm256_set1_ps(0.0f);
__m256 rPrevStateGrad;
__m256 rStateGrad;
__m256 rState;
__m256 rStateAtv;
__m256 rOutputGrad;
__m256 rCheckI = _mm256_set1_ps(0.0f);
__m256 rCheckF = _mm256_set1_ps(0.0f);
__m256 rCheckO = _mm256_set1_ps(0.0f);
__m256 rCheckIGrad;
__m256 rCheckFGrad;
__m256 rCheckOGrad;
__m256 *valueIn = (__m256 *)value.gateValue;
__m256 *valueIg = (__m256 *)(value.gateValue + frameSize);
__m256 *valueFg = (__m256 *)(value.gateValue + frameSize * 2);
__m256 *valueOg = (__m256 *)(value.gateValue + frameSize * 3);
__m256 *gradIn = (__m256 *)grad.gateGrad;
__m256 *gradIg = (__m256 *)(grad.gateGrad + frameSize);
__m256 *gradFg = (__m256 *)(grad.gateGrad + frameSize * 2);
__m256 *gradOg = (__m256 *)(grad.gateGrad + frameSize * 3);
for (int i = 0; i < frameSize / 8; i++) {
rValueIn = valueIn[i];
rValueIg = valueIg[i];
rValueFg = valueFg[i];
rValueOg = valueOg[i];
if (value.checkIg) {
rCheckI = ((__m256 *)value.checkIg)[i];
rCheckF = ((__m256 *)value.checkFg)[i];
rCheckO = ((__m256 *)value.checkOg)[i];
__m256 r_value_in;
__m256 r_value_ig;
__m256 r_value_fg;
__m256 r_value_og;
__m256 r_grad_in;
__m256 r_grad_ig;
__m256 r_grad_fg;
__m256 r_grad_og;
__m256 r_prev_state = _mm256_set1_ps(0.0f);
__m256 r_prev_state_grad;
__m256 r_state_grad;
__m256 r_state;
__m256 r_state_atv;
__m256 r_output_grad;
__m256 r_checkI = _mm256_set1_ps(0.0f);
__m256 r_checkF = _mm256_set1_ps(0.0f);
__m256 r_checkO = _mm256_set1_ps(0.0f);
__m256 r_checkIGrad;
__m256 r_checkFGrad;
__m256 r_checkOGrad;
__m256 *value_in = (__m256 *)value.gate_value;
__m256 *value_ig = (__m256 *)(value.gate_value + frame_size);
__m256 *value_fg = (__m256 *)(value.gate_value + frame_size * 2);
__m256 *value_og = (__m256 *)(value.gate_value + frame_size * 3);
__m256 *grad_in = (__m256 *)grad.gate_grad;
__m256 *grad_ig = (__m256 *)(grad.gate_grad + frame_size);
__m256 *grad_fg = (__m256 *)(grad.gate_grad + frame_size * 2);
__m256 *grad_og = (__m256 *)(grad.gate_grad + frame_size * 3);
for (int i = 0; i < frame_size / 8; i++) {
r_value_in = value_in[i];
r_value_ig = value_ig[i];
r_value_fg = value_fg[i];
r_value_og = value_og[i];
if (value.check_ig) {
r_checkI = ((__m256 *)value.check_ig)[i];
r_checkF = ((__m256 *)value.check_fg)[i];
r_checkO = ((__m256 *)value.check_og)[i];
}
rState = ((__m256 *)value.stateValue)[i];
rStateAtv = ((__m256 *)value.stateActiveValue)[i];
rOutputGrad = ((__m256 *)grad.outputGrad)[i];
rStateGrad = ((__m256 *)grad.stateGrad)[i];
if (value.prevStateValue) {
rPrevState = ((__m256 *)value.prevStateValue)[i];
r_state = ((__m256 *)value.state_value)[i];
r_state_atv = ((__m256 *)value.state_active_value)[i];
r_output_grad = ((__m256 *)grad.output_grad)[i];
r_state_grad = ((__m256 *)grad.state_grad)[i];
if (value.prev_state_value) {
r_prev_state = ((__m256 *)value.prev_state_value)[i];
}
op(rValueIn, rValueIg, rValueFg, rValueOg, rGradIn, rGradIg, rGradFg,
rGradOg, rPrevState, rPrevStateGrad, rState, rStateGrad, rStateAtv,
rOutputGrad, rCheckI, rCheckF, rCheckO, rCheckIGrad, rCheckFGrad,
rCheckOGrad, active_node, active_gate, active_state);
gradIn[i] = rGradIn;
gradIg[i] = rGradIg;
gradFg[i] = rGradFg;
gradOg[i] = rGradOg;
((__m256 *)grad.stateGrad)[i] = rStateGrad;
if (grad.prevStateGrad) ((__m256 *)grad.prevStateGrad)[i] = rPrevStateGrad;
if (value.prevStateValue) {
if (grad.checkIgGrad) ((__m256 *)grad.checkIgGrad)[i] += rCheckIGrad;
if (grad.checkFgGrad) ((__m256 *)grad.checkFgGrad)[i] += rCheckFGrad;
op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_grad_in, r_grad_ig,
r_grad_fg, r_grad_og, r_prev_state, r_prev_state_grad, r_state,
r_state_grad, r_state_atv, r_output_grad, r_checkI, r_checkF, r_checkO,
r_checkIGrad, r_checkFGrad, r_checkOGrad, active_node, active_gate,
active_state);
grad_in[i] = r_grad_in;
grad_ig[i] = r_grad_ig;
grad_fg[i] = r_grad_fg;
grad_og[i] = r_grad_og;
((__m256 *)grad.state_grad)[i] = r_state_grad;
if (grad.prev_state_grad)
((__m256 *)grad.prev_state_grad)[i] = r_prev_state_grad;
if (value.prev_state_value) {
if (grad.check_ig_grad) ((__m256 *)grad.check_ig_grad)[i] += r_checkIGrad;
if (grad.check_fg_grad) ((__m256 *)grad.check_fg_grad)[i] += r_checkFGrad;
}
if (grad.checkOgGrad) ((__m256 *)grad.checkOgGrad)[i] += rCheckOGrad;
if (grad.check_og_grad) ((__m256 *)grad.check_og_grad)[i] += r_checkOGrad;
}
#endif
}
template <class T, class Op>
void cpu_lstm_forward(Op op, LstmMetaValue<T> value, int frameSize,
void cpu_lstm_forward(Op op, LstmMetaValue<T> value, int frame_size,
activation_mode_t active_node,
activation_mode_t active_gate,
activation_mode_t active_state) {
if (Op::avx && !(frameSize & (8 - 1)) && (std::is_same<T, float>::value)) {
avx_lstm_forward_one_sequence<T>(op, value, frameSize, active_node,
if (Op::avx && !(frame_size & (8 - 1)) && (std::is_same<T, float>::value)) {
avx_lstm_forward_one_sequence<T>(op, value, frame_size, active_node,
active_gate, active_state);
} else {
naive_lstm_forward_one_sequence<T>(op, value, frameSize, active_node,
naive_lstm_forward_one_sequence<T>(op, value, frame_size, active_node,
active_gate, active_state);
}
}
template <class T, class Op>
void cpu_lstm_backward(Op op, LstmMetaValue<T> value, LstmMetaGrad<T> grad,
int frameSize, activation_mode_t active_node,
int frame_size, activation_mode_t active_node,
activation_mode_t active_gate,
activation_mode_t active_state) {
if (Op::avx && !(frameSize & (8 - 1)) && (std::is_same<T, float>::value)) {
avx_lstm_backward_one_sequence<T>(op, value, grad, frameSize, active_node,
if (Op::avx && !(frame_size & (8 - 1)) && (std::is_same<T, float>::value)) {
avx_lstm_backward_one_sequence<T>(op, value, grad, frame_size, active_node,
active_gate, active_state);
} else {
naive_lstm_backward_one_sequence<T>(op, value, grad, frameSize, active_node,
active_gate, active_state);
naive_lstm_backward_one_sequence<T>(op, value, grad, frame_size,
active_node, active_gate, active_state);
}
}
......
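The dispatch condition in cpu_lstm_forward/backward reads a little tersely: the AVX path processes 8 floats per __m256 register, so it is taken only when the op supports AVX, frame_size is a whole multiple of 8, and T is float. A standalone restatement of the test:

#include <type_traits>

template <typename T>
bool use_avx_path(bool op_supports_avx, int frame_size) {
  // frame_size & (8 - 1) is the remainder of frame_size / 8, so the test
  // passes only for whole multiples of the __m256 width (8 floats).
  return op_supports_avx && !(frame_size & (8 - 1)) &&
         std::is_same<T, float>::value;
}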
......@@ -26,189 +26,192 @@ namespace math {
namespace detail {
/*
* threads(framePerBlock, batchPerBlock)
* grid(frameBlocks, batchBlocks)
* threads(frame_per_block, batch_per_block)
* grid(frame_blocks, batch_blocks)
*/
template <class T, class Op, bool isBatch>
__global__ void KeLstmForward(Op op, LstmMetaValue<T> value, int frameSize,
int batchSize, activation_mode_t active_node,
template <class T, class Op, bool is_batch>
__global__ void KeLstmForward(Op op, LstmMetaValue<T> value, int frame_size,
int batch_size, activation_mode_t active_node,
activation_mode_t active_gate,
activation_mode_t active_state) {
const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
if (frameIdx >= frameSize) return;
int batchIdx = 0;
if (isBatch) {
batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
if (batchIdx >= batchSize) return;
value.gateValue += batchIdx * frameSize * 4;
value.outputValue += batchIdx * frameSize;
value.stateValue += batchIdx * frameSize;
value.stateActiveValue += batchIdx * frameSize;
const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (frame_idx >= frame_size) return;
int batch_idx = 0;
if (is_batch) {
batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
if (batch_idx >= batch_size) return;
value.gate_value += batch_idx * frame_size * 4;
value.output_value += batch_idx * frame_size;
value.state_value += batch_idx * frame_size;
value.state_active_value += batch_idx * frame_size;
}
T rState;
T rPrevState = 0;
T rStateAtv;
T rOut;
T rValueIn;
T rValueIg;
T rValueFg;
T rValueOg;
T rCheckI = value.checkIg ? value.checkIg[frameIdx] : 0;
T rCheckF = value.checkFg ? value.checkFg[frameIdx] : 0;
T rCheckO = value.checkOg ? value.checkOg[frameIdx] : 0;
rValueIn = value.gateValue[frameIdx];
rValueIg = value.gateValue[frameIdx + frameSize];
rValueFg = value.gateValue[frameIdx + frameSize * 2];
rValueOg = value.gateValue[frameIdx + frameSize * 3];
if (value.prevStateValue) {
if (isBatch) value.prevStateValue += batchIdx * frameSize;
rPrevState = value.prevStateValue[frameIdx];
T r_state;
T r_prev_state = 0;
T r_state_atv;
T r_out;
T r_value_in;
T r_value_ig;
T r_value_fg;
T r_value_og;
T r_checkI = value.check_ig ? value.check_ig[frame_idx] : 0;
T r_checkF = value.check_fg ? value.check_fg[frame_idx] : 0;
T r_checkO = value.check_og ? value.check_og[frame_idx] : 0;
r_value_in = value.gate_value[frame_idx];
r_value_ig = value.gate_value[frame_idx + frame_size];
r_value_fg = value.gate_value[frame_idx + frame_size * 2];
r_value_og = value.gate_value[frame_idx + frame_size * 3];
if (value.prev_state_value) {
if (is_batch) value.prev_state_value += batch_idx * frame_size;
r_prev_state = value.prev_state_value[frame_idx];
}
op(rValueIn, rValueIg, rValueFg, rValueOg, rPrevState, rState, rStateAtv,
rOut, rCheckI, rCheckF, rCheckO, active_node, active_gate, active_state);
op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_prev_state, r_state,
r_state_atv, r_out, r_checkI, r_checkF, r_checkO, active_node, active_gate,
active_state);
value.gateValue[frameIdx] = rValueIn;
value.gateValue[frameIdx + frameSize] = rValueIg;
value.gateValue[frameIdx + frameSize * 2] = rValueFg;
value.gateValue[frameIdx + frameSize * 3] = rValueOg;
value.gate_value[frame_idx] = r_value_in;
value.gate_value[frame_idx + frame_size] = r_value_ig;
value.gate_value[frame_idx + frame_size * 2] = r_value_fg;
value.gate_value[frame_idx + frame_size * 3] = r_value_og;
value.stateValue[frameIdx] = rState;
value.stateActiveValue[frameIdx] = rStateAtv;
value.outputValue[frameIdx] = rOut;
value.state_value[frame_idx] = r_state;
value.state_active_value[frame_idx] = r_state_atv;
value.output_value[frame_idx] = r_out;
}
/*
* threads(framePerBlock, batchPerBlock)
* grid(frameBlocks, batchBlocks)
* threads(frame_per_block, batch_per_block)
* grid(frame_blocks, batch_blocks)
*/
template <class T, class Op, bool isBatch>
template <class T, class Op, bool is_batch>
__global__ void KeLstmBackward(Op op, LstmMetaValue<T> value,
LstmMetaGrad<T> grad, int frameSize,
int batchSize, activation_mode_t active_node,
LstmMetaGrad<T> grad, int frame_size,
int batch_size, activation_mode_t active_node,
activation_mode_t active_gate,
activation_mode_t active_state) {
const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
if (frameIdx >= frameSize) return;
int batchIdx = 0;
if (isBatch) {
batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
if (batchIdx >= batchSize) return;
value.gateValue += batchIdx * frameSize * 4;
value.stateValue += batchIdx * frameSize;
value.stateActiveValue += batchIdx * frameSize;
grad.gateGrad += batchIdx * frameSize * 4;
grad.stateGrad += batchIdx * frameSize;
grad.outputGrad += batchIdx * frameSize;
const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (frame_idx >= frame_size) return;
int batch_idx = 0;
if (is_batch) {
batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
if (batch_idx >= batch_size) return;
value.gate_value += batch_idx * frame_size * 4;
value.state_value += batch_idx * frame_size;
value.state_active_value += batch_idx * frame_size;
grad.gate_grad += batch_idx * frame_size * 4;
grad.state_grad += batch_idx * frame_size;
grad.output_grad += batch_idx * frame_size;
}
T rValueIn;
T rValueIg;
T rValueFg;
T rValueOg;
T rGradIn;
T rGradIg;
T rGradFg;
T rGradOg;
T rPrevState = 0;
T rPrevStateGrad;
T rState;
T rStateGrad;
T rStateAtv;
T rOutputGrad;
T rCheckI = value.checkIg ? value.checkIg[frameIdx] : 0;
T rCheckF = value.checkFg ? value.checkFg[frameIdx] : 0;
T rCheckO = value.checkOg ? value.checkOg[frameIdx] : 0;
T rCheckIGrad;
T rCheckFGrad;
T rCheckOGrad;
rValueIn = value.gateValue[frameIdx];
rValueIg = value.gateValue[frameIdx + frameSize];
rValueFg = value.gateValue[frameIdx + frameSize * 2];
rValueOg = value.gateValue[frameIdx + frameSize * 3];
rState = value.stateValue[frameIdx];
rStateAtv = value.stateActiveValue[frameIdx];
rOutputGrad = grad.outputGrad[frameIdx];
rStateGrad = grad.stateGrad[frameIdx];
if (value.prevStateValue) {
if (isBatch) value.prevStateValue += batchIdx * frameSize;
rPrevState = value.prevStateValue[frameIdx];
T r_value_in;
T r_value_ig;
T r_value_fg;
T r_value_og;
T r_grad_in;
T r_grad_ig;
T r_grad_fg;
T r_grad_og;
T r_prev_state = 0;
T r_prev_state_grad;
T r_state;
T r_state_grad;
T r_state_atv;
T r_output_grad;
T r_checkI = value.check_ig ? value.check_ig[frame_idx] : 0;
T r_checkF = value.check_fg ? value.check_fg[frame_idx] : 0;
T r_checkO = value.check_og ? value.check_og[frame_idx] : 0;
T r_checkIGrad;
T r_checkFGrad;
T r_checkOGrad;
r_value_in = value.gate_value[frame_idx];
r_value_ig = value.gate_value[frame_idx + frame_size];
r_value_fg = value.gate_value[frame_idx + frame_size * 2];
r_value_og = value.gate_value[frame_idx + frame_size * 3];
r_state = value.state_value[frame_idx];
r_state_atv = value.state_active_value[frame_idx];
r_output_grad = grad.output_grad[frame_idx];
r_state_grad = grad.state_grad[frame_idx];
if (value.prev_state_value) {
if (is_batch) value.prev_state_value += batch_idx * frame_size;
r_prev_state = value.prev_state_value[frame_idx];
}
op(rValueIn, rValueIg, rValueFg, rValueOg, rGradIn, rGradIg, rGradFg, rGradOg,
rPrevState, rPrevStateGrad, rState, rStateGrad, rStateAtv, rOutputGrad,
rCheckI, rCheckF, rCheckO, rCheckIGrad, rCheckFGrad, rCheckOGrad,
active_node, active_gate, active_state);
grad.gateGrad[frameIdx] = rGradIn;
grad.gateGrad[frameIdx + frameSize] = rGradIg;
grad.gateGrad[frameIdx + frameSize * 2] = rGradFg;
grad.gateGrad[frameIdx + frameSize * 3] = rGradOg;
grad.stateGrad[frameIdx] = rStateGrad;
if (grad.prevStateGrad) {
if (isBatch) grad.prevStateGrad += batchIdx * frameSize;
grad.prevStateGrad[frameIdx] = rPrevStateGrad;
op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_grad_in, r_grad_ig,
r_grad_fg, r_grad_og, r_prev_state, r_prev_state_grad, r_state,
r_state_grad, r_state_atv, r_output_grad, r_checkI, r_checkF, r_checkO,
r_checkIGrad, r_checkFGrad, r_checkOGrad, active_node, active_gate,
active_state);
grad.gate_grad[frame_idx] = r_grad_in;
grad.gate_grad[frame_idx + frame_size] = r_grad_ig;
grad.gate_grad[frame_idx + frame_size * 2] = r_grad_fg;
grad.gate_grad[frame_idx + frame_size * 3] = r_grad_og;
grad.state_grad[frame_idx] = r_state_grad;
if (grad.prev_state_grad) {
if (is_batch) grad.prev_state_grad += batch_idx * frame_size;
grad.prev_state_grad[frame_idx] = r_prev_state_grad;
}
if (isBatch) {
if (value.prevStateValue) {
if (grad.checkIgGrad)
paddle::platform::CudaAtomicAdd(grad.checkIgGrad + frameIdx,
rCheckIGrad);
if (grad.checkFgGrad)
paddle::platform::CudaAtomicAdd(grad.checkFgGrad + frameIdx,
rCheckFGrad);
if (is_batch) {
if (value.prev_state_value) {
if (grad.check_ig_grad)
paddle::platform::CudaAtomicAdd(grad.check_ig_grad + frame_idx,
r_checkIGrad);
if (grad.check_fg_grad)
paddle::platform::CudaAtomicAdd(grad.check_fg_grad + frame_idx,
r_checkFGrad);
}
if (grad.checkOgGrad)
paddle::platform::CudaAtomicAdd(grad.checkOgGrad + frameIdx, rCheckOGrad);
if (grad.check_og_grad)
paddle::platform::CudaAtomicAdd(grad.check_og_grad + frame_idx,
r_checkOGrad);
} else {
if (value.prevStateValue) {
if (grad.checkIgGrad) grad.checkIgGrad[frameIdx] += rCheckIGrad;
if (grad.checkFgGrad) grad.checkFgGrad[frameIdx] += rCheckFGrad;
if (value.prev_state_value) {
if (grad.check_ig_grad) grad.check_ig_grad[frame_idx] += r_checkIGrad;
if (grad.check_fg_grad) grad.check_fg_grad[frame_idx] += r_checkFGrad;
}
if (grad.checkOgGrad) grad.checkOgGrad[frameIdx] += rCheckOGrad;
if (grad.check_og_grad) grad.check_og_grad[frame_idx] += r_checkOGrad;
}
}
template <class T, class Op>
void gpu_lstm_forward(const platform::DeviceContext& context, Op op,
                      LstmMetaValue<T> value, int frame_size, int batch_size,
                      activation_mode_t active_node,
                      activation_mode_t active_gate,
                      activation_mode_t active_state) {
  dim3 threads;
  dim3 grid;
  if (batch_size == 1) {
    int frame_per_block = frame_size <= 1024 ? frame_size : 1024;
    int frame_blocks = (frame_size + 1024 - 1) / 1024;
    threads = dim3(frame_per_block, 1);
    grid = dim3(frame_blocks, 1);
  } else {
    /* frame_per_block = 32 batch_per_block = 32 */
    threads = dim3(32, 32);
    grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32);
  }

  auto stream =
      reinterpret_cast<const platform::CUDADeviceContext&>(context).stream();
  if (batch_size == 1) {
    KeLstmForward<T, Op,
                  /* is_batch= */ false><<<grid, threads, 0, stream>>>(
        op, value, frame_size, batch_size, active_node, active_gate,
        active_state);
  } else {
    KeLstmForward<T, Op,
                  /* is_batch= */ true><<<grid, threads, 0, stream>>>(
        op, value, frame_size, batch_size, active_node, active_gate,
        active_state);
  }
}
......@@ -216,34 +219,34 @@ void gpu_lstm_forward(const platform::DeviceContext& context, Op op,
template <class T, class Op>
void gpu_lstm_backward(const platform::DeviceContext& context, Op op,
                       LstmMetaValue<T> value, LstmMetaGrad<T> grad,
                       int frame_size, int batch_size,
                       activation_mode_t active_node,
                       activation_mode_t active_gate,
                       activation_mode_t active_state) {
  dim3 threads;
  dim3 grid;
  if (batch_size == 1) {
    int frame_per_block = frame_size <= 1024 ? frame_size : 1024;
    int frame_blocks = (frame_size + 1024 - 1) / 1024;
    threads = dim3(frame_per_block, 1);
    grid = dim3(frame_blocks, 1);
  } else {
    /* frame_per_block = 32 batch_per_block = 16 */
    threads = dim3(32, 16);
    grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 16 - 1) / 16);
  }

  auto stream =
      reinterpret_cast<const platform::CUDADeviceContext&>(context).stream();
  if (batch_size == 1) {
    KeLstmBackward<T, Op,
                   /* is_batch= */ false><<<grid, threads, 0, stream>>>(
        op, value, grad, frame_size, batch_size, active_node, active_gate,
        active_state);
  } else {
    KeLstmBackward<T, Op,
                   /* is_batch= */ true><<<grid, threads, 0, stream>>>(
        op, value, grad, frame_size, batch_size, active_node, active_gate,
        active_state);
  }
}
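// Note: the backward pass launches 32x16 tiles versus the forward pass's
// 32x32, presumably to keep per-thread register pressure manageable in the
// heavier backward kernel.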
......
......@@ -27,19 +27,19 @@ namespace forward {
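// The forward functor below evaluates one peephole-LSTM step per call:
//   i = act_gate(a_i + c_prev * w_ci),  f = act_gate(a_f + c_prev * w_cf),
//   c = act_node(a_c) * i + c_prev * f, o = act_gate(a_o + c * w_co),
//   h = o * act_state(c),
// where a_* are the pre-activation gate values and w_c* the peephole weights.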
template <class T>
class lstm {
public:
  HOSTDEVICE void operator()(T &value_in, T &value_ig, T &value_fg, T &value_og,
                             T &prev_state, T &state, T &state_atv, T &output,
                             T &checkI, T &checkF, T &checkO,
                             activation_mode_t active_node,
                             activation_mode_t active_gate,
                             activation_mode_t active_state) {
    value_in = activation(value_in, active_node);
    value_ig = activation(value_ig + prev_state * checkI, active_gate);
    value_fg = activation(value_fg + prev_state * checkF, active_gate);
    state = value_in * value_ig + prev_state * value_fg;
    value_og = activation(value_og + state * checkO, active_gate);
    state_atv = activation(state, active_state);
    output = value_og * state_atv;
  }
#ifndef __NVCC__
#ifndef __AVX__  // If not compiled with AVX instructions, disable AVX by default
......@@ -48,24 +48,27 @@ class lstm {
// Only float support AVX optimization
static const bool avx = std::is_same<T, float>::value;
  HOSTDEVICE void operator()(__m256 &value_in, __m256 &value_ig,
                             __m256 &value_fg, __m256 &value_og,
                             __m256 &prev_state, __m256 &state,
                             __m256 &state_atv, __m256 &output, __m256 &checkI,
                             __m256 &checkF, __m256 &checkO,
                             activation_mode_t active_node,
                             activation_mode_t active_gate,
                             activation_mode_t active_state) {
    value_in = activation(value_in, active_node);
    value_ig =
        activation(_mm256_add_ps(value_ig, _mm256_mul_ps(prev_state, checkI)),
                   active_gate);
    value_fg =
        activation(_mm256_add_ps(value_fg, _mm256_mul_ps(prev_state, checkF)),
                   active_gate);
    state = _mm256_add_ps(_mm256_mul_ps(value_in, value_ig),
                          _mm256_mul_ps(prev_state, value_fg));
    value_og = activation(_mm256_add_ps(value_og, _mm256_mul_ps(state, checkO)),
                          active_gate);
    state_atv = activation(state, active_state);
    output = _mm256_mul_ps(value_og, state_atv);
  }
#endif
#endif
......@@ -78,25 +81,26 @@ namespace backward {
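// The backward functor mirrors the forward step: given output_grad and the
// running state_grad, it back-propagates through the peephole LSTM cell,
// producing the four gate gradients and the peephole-weight gradients
// checkIGrad/checkFGrad/checkOGrad.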
template <class T>
class lstm {
public:
  HOSTDEVICE void operator()(T &value_in, T &value_ig, T &value_fg, T &value_og,
                             T &grad_in, T &grad_ig, T &grad_fg, T &grad_og,
                             T &prev_state, T &prev_state_grad, T &state,
                             T &state_grad, T &state_atv, T &output_grad,
                             T &checkI, T &checkF, T &checkO, T &checkIGrad,
                             T &checkFGrad, T &checkOGrad,
                             activation_mode_t active_node,
                             activation_mode_t active_gate,
                             activation_mode_t active_state) {
    grad_og = activation(output_grad * state_atv, value_og, active_gate);
    state_grad += activation(output_grad * value_og, state_atv, active_state) +
                  grad_og * checkO;
    grad_in = activation(state_grad * value_ig, value_in, active_node);
    grad_ig = activation(state_grad * value_in, value_ig, active_gate);
    grad_fg = activation(state_grad * prev_state, value_fg, active_gate);
    prev_state_grad =
        grad_ig * checkI + grad_fg * checkF + state_grad * value_fg;
    checkIGrad = grad_ig * prev_state;
    checkFGrad = grad_fg * prev_state;
    checkOGrad = grad_og * state;
  }
#ifndef __NVCC__
#ifndef __AVX__  // If not compiled with AVX instructions, disable AVX by default
......@@ -105,32 +109,32 @@ class lstm {
// Only float support AVX optimization
static const bool avx = std::is_same<T, float>::value;
  HOSTDEVICE void operator()(
      __m256 &value_in, __m256 &value_ig, __m256 &value_fg, __m256 &value_og,
      __m256 &grad_in, __m256 &grad_ig, __m256 &grad_fg, __m256 &grad_og,
      __m256 &prev_state, __m256 &prev_state_grad, __m256 &state,
      __m256 &state_grad, __m256 &state_atv, __m256 &output_grad,
      __m256 &checkI, __m256 &checkF, __m256 &checkO, __m256 &checkIGrad,
      __m256 &checkFGrad, __m256 &checkOGrad, activation_mode_t active_node,
      activation_mode_t active_gate, activation_mode_t active_state) {
    grad_og = activation(_mm256_mul_ps(output_grad, state_atv), value_og,
                         active_gate);
    state_grad = _mm256_add_ps(activation(_mm256_mul_ps(output_grad, value_og),
                                          state_atv, active_state),
                               state_grad);
    state_grad = _mm256_add_ps(_mm256_mul_ps(grad_og, checkO), state_grad);
    grad_in =
        activation(_mm256_mul_ps(state_grad, value_ig), value_in, active_node);
    grad_ig =
        activation(_mm256_mul_ps(state_grad, value_in), value_ig, active_gate);
    grad_fg = activation(_mm256_mul_ps(state_grad, prev_state), value_fg,
                         active_gate);
    prev_state_grad = _mm256_add_ps(_mm256_mul_ps(grad_ig, checkI),
                                    _mm256_mul_ps(grad_fg, checkF));
    prev_state_grad =
        _mm256_add_ps(_mm256_mul_ps(state_grad, value_fg), prev_state_grad);
    checkIGrad = _mm256_mul_ps(grad_ig, prev_state);
    checkFGrad = _mm256_mul_ps(grad_fg, prev_state);
    checkOGrad = _mm256_mul_ps(grad_og, state);
  }
#endif
#endif
......
......@@ -30,12 +30,12 @@ struct LstmUnitFunctor<platform::CPUPlace, T> {
detail::cpu_lstm_forward(detail::forward::lstm<T>(), value, frame_size,
ActiveType(cand_act), ActiveType(gate_act),
ActiveType(cell_act));
      value.gate_value += frame_size * 4;
      value.state_value += frame_size;
      value.state_active_value += frame_size;
      value.output_value += frame_size;
      if (value.prev_state_value) {
        value.prev_state_value += frame_size;
}
}
}
......@@ -53,20 +53,20 @@ struct LstmUnitGradFunctor<platform::CPUPlace, T> {
frame_size, ActiveType(cand_act),
ActiveType(gate_act), ActiveType(cell_act));
      value.gate_value += frame_size * 4;
      value.state_value += frame_size;
      value.state_active_value += frame_size;
      value.output_value += frame_size;
      if (value.prev_state_value) {
        value.prev_state_value += frame_size;
      }

      grad.gate_grad += frame_size * 4;
      grad.state_grad += frame_size;
      grad.state_active_grad += frame_size;
      grad.output_grad += frame_size;
      if (grad.prev_state_grad) {
        grad.prev_state_grad += frame_size;
}
}
}
......
......@@ -31,26 +31,26 @@ typedef enum {
template <class T>
struct LstmMetaValue {
  T *gate_value;
  T *prev_state_value;
  T *state_value;
  T *state_active_value;
  T *output_value;
  T *check_ig;
  T *check_fg;
  T *check_og;
};
template <class T>
struct LstmMetaGrad {
  T *gate_grad;
  T *prev_state_grad;
  T *state_grad;
  T *state_active_grad;
  T *output_grad;
  T *check_ig_grad;
  T *check_fg_grad;
  T *check_og_grad;
};
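// Layout note: gate_value/gate_grad pack the four gates contiguously per
// frame as [input, input-gate, forget-gate, output-gate], each frame_size
// wide, which is why the CPU functors above advance these pointers by
// frame_size * 4 between steps.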
inline activation_mode_t ActiveType(const std::string &type) {
......
......@@ -99,13 +99,7 @@ class MultiplexGradOp : public framework::OperatorWithKernel {
"Output(X@Grad) should not be null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) should not be null.");
    // No need to compute gradient for Input(Ids)
    ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X"));
}
protected:
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/nce_op.h"
namespace paddle {
namespace operators {
using framework::Tensor;
class NCEOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Input"));
PADDLE_ENFORCE(ctx->HasInput("Label"));
PADDLE_ENFORCE(ctx->HasInput("Weight"));
PADDLE_ENFORCE(ctx->HasOutput("Cost"));
PADDLE_ENFORCE(ctx->HasOutput("SampleLogits"));
PADDLE_ENFORCE(ctx->HasOutput("SampleLabels"));
auto x_dims = ctx->GetInputDim("Input");
auto label_dims = ctx->GetInputDim("Label");
PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0]);
int num_true_classes = label_dims.size() == 2 ? label_dims[1] : 1;
if (ctx->HasInput("Bias")) {
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Weight")[0],
ctx->GetInputDim("Bias")[0]);
}
auto num_neg_samples = ctx->Attrs().Get<int>("num_neg_samples");
auto num_total_classes = ctx->Attrs().Get<int>("num_total_classes");
std::vector<int> custom_neg_classes =
ctx->Attrs().Get<std::vector<int>>("custom_neg_classes");
PADDLE_ENFORCE_EQ(num_total_classes, ctx->GetInputDim("Weight")[0]);
if (custom_neg_classes.size() > 0) {
PADDLE_ENFORCE_EQ(custom_neg_classes.size(),
static_cast<size_t>(num_neg_samples));
}
// set dims of output(Out)
std::vector<int64_t> out_dims;
out_dims.push_back(x_dims[0]);
out_dims.push_back(1);
ctx->SetOutputDim("Cost", framework::make_ddim(out_dims));
    // set dims of output(SampleLogits) and output(SampleLabels)
std::vector<int64_t> sample_out_dims;
sample_out_dims.push_back(x_dims[0]);
sample_out_dims.push_back(num_neg_samples + num_true_classes);
ctx->SetOutputDim("SampleLogits", framework::make_ddim(sample_out_dims));
ctx->SetOutputDim("SampleLabels", framework::make_ddim(sample_out_dims));
}
protected:
framework::OpKernelType GetKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Input")->type()),
ctx.device_context());
}
};
class NCEOpMaker : public framework::OpProtoAndCheckerMaker {
public:
NCEOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("Input", "(Tensor) A tensor of shape [batch_size, dim].");
AddInput(
"Label",
"(Tensor) A tensor of shape [batch_size, num_true_class]. "
"'num_true_class' is the number of target classes in each sample."
"The number of target classes per sample should be same. "
"If you have a variable number of target classes, "
"you can pad them out to a constant number by either repeating them"
" or by padding with an otherwise unused class.)");
AddInput("Weight",
"(Tensor) A tensor of shape [num_class, dim]. 'num_class' is the "
"total number of class.");
AddInput(
"Bias",
"(Tensor) A tensor of shape [num_class, 1]. 'num_class' is the total "
"number of class. It is a dispensable input.")
.AsDispensable();
AddInput("SampleWeight",
"(Tensor) A tensor of shape [batch_size, 1] storing a weight for "
"each sample. And it is a dispensable input. The default value of "
"sample is 1.")
.AsDispensable();
AddOutput("Cost",
"(Tensor) A tensor of shape [batch_size, 1]. Cost of samples.");
AddOutput("SampleLogits",
"An intermediate tensor of shape[batch_size, num_neg_samples + "
"num_pos_samples]."
"This tensor is output of forward kernel and used in backward "
"kernel to compute grads."
"Given X is the dot product of input tensor and sampled labels' "
"weights."
"Then 'SampleLogits' is sigmoid(X).")
.AsIntermediate();
AddOutput("SampleLabels",
"An intermediate tensor of shape[batch_size, num_neg_samples + "
"num_pos_samples]."
"This tensor is output of forward kernel and used in backward "
"kernel to compute grads."
"")
.AsIntermediate();
AddAttr<int>("num_total_classes",
"Total number of classes in all samples.");
AddAttr<int>("num_neg_samples",
"The number of negative classes. The default value is 10.")
.SetDefault(10);
AddAttr<std::vector<int>>("custom_neg_classes",
"This attribute only be used in unitest. Classes "
"in this list wiil be used as negative classes "
"for every samples. Under normal conditions, "
"user should avoid setting this attribute.");
AddComment(R"DOC(
Compute and return the noise-contrastive estimation training loss.
See [Noise-contrastive estimation: A new estimation principle for unnormalized statistical models](http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf).
By default this operator uses a uniform distribution for sampling.
)DOC");
}
};
class NCEOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Input"));
PADDLE_ENFORCE(ctx->HasInput("Weight"));
PADDLE_ENFORCE(ctx->HasInput("Cost"));
PADDLE_ENFORCE(ctx->HasInput("SampleLogits"));
PADDLE_ENFORCE(ctx->HasInput("SampleLabels"));
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Cost")),
"The input(Out@GRAD) should not be null.");
auto x_dims = ctx->GetInputDim("Input");
auto x_grad_name = framework::GradVarName("Input");
if (ctx->HasOutput(x_grad_name)) {
ctx->SetOutputDim(x_grad_name, x_dims);
}
auto w_dims = ctx->GetInputDim("Weight");
auto w_grad_name = framework::GradVarName("Weight");
if (ctx->HasOutput(w_grad_name)) {
ctx->SetOutputDim(w_grad_name, w_dims);
}
auto bias_grad_name = framework::GradVarName("Bias");
if (ctx->HasOutput(bias_grad_name)) {
auto bias_dims = ctx->GetInputDim("Bias");
ctx->SetOutputDim(bias_grad_name, bias_dims);
}
}
protected:
framework::OpKernelType GetKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Input")->type()),
ctx.device_context());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(nce, ops::NCEOp, ops::NCEOpMaker, nce_grad, ops::NCEOpGrad);
REGISTER_OP_CPU_KERNEL(nce, ops::NCEKernel<paddle::platform::CPUPlace, float>,
ops::NCEKernel<paddle::platform::CPUPlace, double>);
REGISTER_OP_CPU_KERNEL(nce_grad,
ops::NCEGradKernel<paddle::platform::CPUPlace, float>,
ops::NCEGradKernel<paddle::platform::CPUPlace, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <math.h>
#include <random>
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
#include "unsupported/Eigen/CXX11/Tensor"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename Place, typename T>
void PrepareSamples(const framework::ExecutionContext& context) {
auto label = context.Input<Tensor>("Label");
const int64_t* label_data = label->data<int64_t>();
auto label_dims = label->dims();
int num_total_classes = context.Attr<int>("num_total_classes");
  // for unit test
std::vector<int> custom_neg_classes =
context.Attr<std::vector<int>>("custom_neg_classes");
// random machine
std::random_device rd;
std::mt19937 rng(rd());
std::uniform_int_distribution<int> rand(0, num_total_classes - 1);
auto sample_labels = context.Output<Tensor>("SampleLabels");
auto sample_labels_dims = sample_labels->dims();
int64_t* sample_labels_data =
sample_labels->mutable_data<int64_t>(context.GetPlace());
int num_label = label_dims.size() == 2 ? label_dims[1] : 1;
int index = 0;
for (size_t i = 0; i < label_dims[0]; ++i) {
int j = 0;
for (; j < num_label; ++j) {
sample_labels_data[index++] = label_data[i * num_label + j];
}
if (custom_neg_classes.size() > 0) {
for (auto label : custom_neg_classes) {
sample_labels_data[index++] = label;
}
} else {
for (; j < sample_labels_dims[1]; ++j) {
// TODO(wanghaoshuang): support more distribution sampling
sample_labels_data[index++] = rand(rng);
}
}
}
}
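// SampleLabels layout per example: the first num_label entries are the true
// classes copied from Label, followed by num_neg_samples negative classes,
// either taken from custom_neg_classes or drawn uniformly at random.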
template <typename Place, typename T>
class NCEKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
PrepareSamples<Place, T>(context);
auto sample_labels = context.Output<Tensor>("SampleLabels");
const int64_t* sample_labels_data = sample_labels->data<int64_t>();
auto sample_out = context.Output<Tensor>("SampleLogits");
T* sample_out_data = sample_out->mutable_data<T>(context.GetPlace());
auto label = context.Input<Tensor>("Label");
auto sample_weight = context.Input<Tensor>("SampleWeight");
const T* sample_weight_data = nullptr;
if (sample_weight != nullptr) {
sample_weight_data = sample_weight->data<T>();
}
auto out = context.Output<Tensor>("Cost");
T* out_data = out->mutable_data<T>(context.GetPlace());
int num_neg_samples = context.Attr<int>("num_neg_samples");
int num_total_classes = context.Attr<int>("num_total_classes");
int num_true_class = 1;
if (label != nullptr) {
num_true_class = label->dims()[1];
}
T b = 1. / num_total_classes * num_neg_samples;
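    // b equals k * q(w): num_neg_samples times the uniform noise probability
    // 1 / num_total_classes. The NCE posterior used below is o / (o + b) for
    // a true class and b / (o + b) for a sampled one.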
// forward bias
auto bias = context.Input<Tensor>("Bias");
if (bias != nullptr) {
const T* bias_data = bias->data<T>();
for (size_t i = 0; i < sample_labels->numel(); ++i) {
sample_out_data[i] = bias_data[sample_labels_data[i]];
}
} else {
for (size_t i = 0; i < sample_labels->numel(); ++i) {
sample_out_data[i] = 0;
}
}
// forward mul
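    // Each logit is the dot product of an example's input row with the
    // Weight row indexed by its sampled label; the sigmoid below turns it
    // into the probability o used by the cost.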
auto input_mat = EigenMatrix<T>::From(*(context.Input<Tensor>("Input")));
auto weight_mat = EigenMatrix<T>::From(*(context.Input<Tensor>("Weight")));
for (size_t i = 0; i < sample_labels->numel(); ++i) {
Eigen::Tensor<T, 0, Eigen::RowMajor, Eigen::DenseIndex> result =
(input_mat.chip((int)(i / sample_labels->dims()[1]), 0) *
weight_mat.chip(sample_labels_data[i], 0))
.sum();
sample_out_data[i] += result(0);
sample_out_data[i] = (1. / (1. + exp(-sample_out_data[i])));
}
// forward cost
for (size_t i = 0; i < sample_labels->dims()[0]; ++i) {
size_t j = 0;
out_data[i] = 0;
T w = sample_weight == nullptr ? 1. : sample_weight_data[i];
// for true classes
for (; j < num_true_class; ++j) {
T o = sample_out_data[i * sample_out->dims()[1] + j];
T cost = -log(o / (o + b));
out_data[i] += w * cost;
}
// for sampled neg classes
for (; j < sample_labels->dims()[1]; ++j) {
T o = sample_out_data[i * sample_out->dims()[1] + j];
T cost = -log(b / (o + b));
out_data[i] += w * cost;
}
}
}
};
template <typename Place, typename T>
class NCEGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto d_out = context.Input<Tensor>(framework::GradVarName("Cost"));
const T* d_out_data = d_out->data<T>();
auto label = context.Input<Tensor>("Label");
auto sample_out = context.Input<Tensor>("SampleLogits");
const T* sample_out_data = sample_out->data<T>();
auto sample_labels = context.Input<Tensor>("SampleLabels");
const int64_t* sample_labels_data = sample_labels->data<int64_t>();
auto sample_weight = context.Input<Tensor>("SampleWeight");
const T* sample_weight_data = nullptr;
if (sample_weight != nullptr) {
sample_weight_data = sample_weight->data<T>();
}
int num_neg_samples = context.Attr<int>("num_neg_samples");
int num_total_classes = context.Attr<int>("num_total_classes");
int num_true_class = 1;
if (label != nullptr) {
num_true_class = label->dims()[1];
}
T b = 1. / num_total_classes * num_neg_samples;
Tensor sample_grad; // tmp tensor
T* sample_grad_data =
sample_grad.mutable_data<T>(sample_labels->dims(), context.GetPlace());
// backward cost
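    // With o = sigmoid(x), d(-log(o/(o+b)))/dx = (b/(o+b)) * (o - 1) for a
    // true class and d(-log(b/(o+b)))/dx = o * (1 - o) / (o + b) for a
    // sampled class; both already fold in sigmoid'(x) = o * (1 - o).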
for (size_t i = 0; i < sample_labels->numel(); ++i) {
T o = sample_out_data[i];
T w = sample_weight == nullptr
? 1
: sample_weight_data[i / sample_labels->dims()[1]];
sample_grad_data[i] = (i % sample_labels->dims()[1]) < num_true_class
? w * (b / (o + b)) * (o - 1)
: w * (o * (1 - o) / (o + b));
sample_grad_data[i] *= d_out_data[i / sample_labels->dims()[1]];
}
// get d_bias
auto d_bias = context.Output<Tensor>(framework::GradVarName("Bias"));
if (d_bias != nullptr) {
T* d_bias_data = d_bias->mutable_data<T>(context.GetPlace());
std::fill(d_bias_data, d_bias_data + d_bias->numel(), 0.0);
for (size_t i = 0; i < sample_labels->numel(); ++i) {
d_bias_data[sample_labels_data[i]] += sample_grad_data[i];
}
}
// get d_w
auto d_w = context.Output<Tensor>(framework::GradVarName("Weight"));
if (d_w != nullptr) {
auto d_w_data = d_w->mutable_data<T>(context.GetPlace());
std::fill(d_w_data, d_w_data + d_w->numel(), 0.0);
auto d_w_matrix = EigenMatrix<T>::From(*d_w);
auto x_matrix = EigenMatrix<T>::From(*(context.Input<Tensor>("Input")));
for (size_t i = 0; i < sample_labels->numel(); ++i) {
d_w_matrix.chip(sample_labels_data[i], 0) +=
x_matrix.chip((int)(i / sample_labels->dims()[1]), 0) *
sample_grad_data[i];
}
}
// get d_x
auto d_x = context.Output<Tensor>(framework::GradVarName("Input"));
if (d_x != nullptr) {
d_x->mutable_data<T>(context.GetPlace());
auto d_x_matrix = EigenMatrix<T>::From(*d_x);
auto w_matrix = EigenMatrix<T>::From(*(context.Input<Tensor>("Weight")));
for (size_t i = 0; i < sample_labels->numel(); ++i) {
d_x_matrix.chip((int)(i / sample_labels->dims()[1]), 0) +=
w_matrix.chip(sample_labels_data[i], 0) * sample_grad_data[i];
}
}
}
};
} // namespace operators
} // namespace paddle
......@@ -4,7 +4,7 @@
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
......@@ -35,9 +35,10 @@ class RankLossOp : public framework::OperatorWithKernel {
auto right_dims = ctx->GetInputDim("Right");
PADDLE_ENFORCE((label_dims == left_dims) && (left_dims == right_dims),
"All inputs must have the same size");
PADDLE_ENFORCE((label_dims.size() == 2) && (label_dims[1] == 1),
"All inputs must be row vector with size batch_size x 1.");
"All inputs must have the same size.");
PADDLE_ENFORCE(
(label_dims.size() == 2) && (label_dims[1] == 1),
"All inputs must be 2-D tensors with shape [batch_size x 1].");
ctx->SetOutputDim("Out", label_dims);
}
};
......@@ -48,10 +49,17 @@ class RankLossOpMaker : public framework::OpProtoAndCheckerMaker {
framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("Label",
"The label indicating A ranked higher than B or not, row vector.");
AddInput("Left", "The output of RankNet for doc A, vector.");
AddInput("Right", "The output of RankNet for doc B, vetor.");
AddOutput("Out", "The output loss of RankLoss operator, vector.");
"(2-D Tensor with shape [batch_size x 1]) "
"The label indicating A ranked higher than B or not.");
AddInput("Left",
"(2-D Tensor with shape [batch_size x 1]) "
"The output of RankNet for doc A.");
AddInput("Right",
"(2-D Tensor with shape [batch_size x 1]) "
"The output of RankNet for doc B.");
AddOutput("Out",
"(2-D Tensor with shape [batch_size x 1]) "
"The output loss of RankLoss operator.");
AddComment(R"DOC(
RankLoss Operator.
......@@ -65,16 +73,17 @@ P = {0, 1} or {0, 0.5, 1}, where 0.5 means no information about the rank of
the input pair.
The RankLoss operator takes three inputs: Left (o_i), Right (o_j) and Label
(P_{i,j}), which represent the output of RankNet for the two docs and the label,
respectively, and yields the rank loss C_{i,j} using the following equation:
(P_{i,j}), which represent the output score of RankNet for the two docs and
the label respectively, and yields the rank loss C_{i,j} using the following
equation:
$$
C_{i,j} = -\tilde{P_{ij}} * o_{i,j} + \log(1 + e^{o_{i,j}}) \\
o_{i,j} = o_i - o_j \\
\tilde{P_{i,j}} = \left \{0, 0.5, 1 \right \} \ or \ \left \{0, 1 \right \}
$$
The operator can take batch inputs with size batch_size (batch_size >= 1).
)DOC");
}
......
......@@ -4,7 +4,7 @@
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
......
......@@ -4,7 +4,7 @@
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
......
......@@ -599,7 +599,9 @@ class RecurrentGradOpShapeInference : public framework::InferShapeBase {
std::vector<std::string> output{kOutputs};
for (auto &s : input) {
PADDLE_ENFORCE(ctx->HasInputs(s));
      PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(s)),
                     "Cannot find the gradient variable %s",
                     framework::GradVarName(s));
}
for (auto &s : output) {
PADDLE_ENFORCE(ctx->HasInputs(s));
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
......@@ -38,8 +37,8 @@ class ReshapeOp : public framework::OperatorWithKernel {
// TODO(qiao) change batch_size
for (size_t i = 1; i < shape.size(); ++i) {
PADDLE_ENFORCE(shape[i] > 0,
"Each dimension of shape "
"must be positiv except the first.");
"Each dimension of Attr(shape) "
"must be positive except the first one.");
}
if (shape[0] < 0) {
shape[0] = x_dims[0];
......
......@@ -4,7 +4,7 @@
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
......
......@@ -4,7 +4,7 @@
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
......
......@@ -104,6 +104,7 @@ class SequencePoolGradOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE_EQ(og_dims[i], x_dims[i], "The dimension mismatch.");
}
ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
ctx->ShareLoD("X", framework::GradVarName("X"));
}
protected:
......
......@@ -54,10 +54,10 @@ class SequenceSliceOpKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now.");
PADDLE_ENFORCE_EQ(
n, static_cast<size_t>(length->dims()[0]),
"The size of input-sequence and length-array should be the same")
"The size of input-sequence and length-array should be the same");
PADDLE_ENFORCE_EQ(
n, static_cast<size_t>(offset->dims()[0]),
"The size of input-sequence and offset-array should be the same")
"The size of input-sequence and offset-array should be the same");
const int64_t* offset_data = offset->data<int64_t>();
const int64_t* length_data = length->data<int64_t>();
......@@ -78,11 +78,11 @@ class SequenceSliceOpKernel : public framework::OpKernel<T> {
for (size_t i = 0; i < n; ++i) {
PADDLE_ENFORCE_LT(0, offset_data[i],
"The offset[%d] must greater than zero.", i)
"The offset[%d] must greater than zero.", i);
PADDLE_ENFORCE_LT(0, length_data[i],
"The length[%d] must greater than zero.", i)
"The length[%d] must greater than zero.", i);
PADDLE_ENFORCE_LT(lod[0][i] + offset_data[i] + length_data[i],
                        lod[0][i + 1], "The target tensor's length overflows.");
}
out->mutable_data<T>(ctx.GetPlace());
......
......@@ -25,20 +25,19 @@ class SigmoidCrossEntropyWithLogitsOp : public framework::OperatorWithKernel {
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
PADDLE_ENFORCE(ctx->HasInput("Labels"),
"Input(Labels) should be not null.");
PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should be not null.");
auto x_dims = ctx->GetInputDim("X");
auto labels_dims = ctx->GetInputDim("Labels");
auto labels_dims = ctx->GetInputDim("Label");
PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2.");
PADDLE_ENFORCE_EQ(labels_dims.size(), 2,
"Input(Labels)'s rank should be 2.");
"Input(Label)'s rank should be 2.");
PADDLE_ENFORCE_EQ(x_dims[0], labels_dims[0],
"The 1st dimension of Input(X) and Input(Labels) should "
"The 1st dimension of Input(X) and Input(Label) should "
"be equal.");
PADDLE_ENFORCE_EQ(x_dims[1], labels_dims[1],
"The 2nd dimension of Input(X) and Input(Labels) should "
"The 2nd dimension of Input(X) and Input(Label) should "
"be equal.");
ctx->SetOutputDim("Out", x_dims);
......@@ -53,26 +52,25 @@ class SigmoidCrossEntropyWithLogitsGradOp
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
PADDLE_ENFORCE(ctx->HasInput("Labels"),
"Input(Labels) should be not null.");
PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                   "Input(Out@GRAD) should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
                   "Output(X@GRAD) should not be null.");
auto x_dims = ctx->GetInputDim("X");
auto labels_dims = ctx->GetInputDim("Labels");
auto labels_dims = ctx->GetInputDim("Label");
auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out"));
PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2.");
PADDLE_ENFORCE_EQ(labels_dims.size(), 2,
"Input(Labels)'s rank should be 2.");
"Input(Label)'s rank should be 2.");
PADDLE_ENFORCE_EQ(dout_dims.size(), 2,
"Input(Out@Grad)'s rank should be 2.");
PADDLE_ENFORCE_EQ(x_dims[0], labels_dims[0],
"The 1st dimension of Input(X) and Input(Labels) should "
"The 1st dimension of Input(X) and Input(Label) should "
"be equal.");
PADDLE_ENFORCE_EQ(x_dims[1], labels_dims[1],
"The 2nd dimension of Input(X) and Input(Labels) should "
"The 2nd dimension of Input(X) and Input(Label) should "
"be equal.");
PADDLE_ENFORCE_EQ(x_dims[0], dout_dims[0],
"The 1st dimension of Input(X) and Input(Out@Grad) "
......@@ -97,7 +95,7 @@ class SigmoidCrossEntropyWithLogitsOpMaker
"This input is a tensor of logits computed by the previous "
" operator. Logits are unscaled log probabilities given as "
"log(p/(1-p)).");
AddInput("Labels",
AddInput("Label",
"(Tensor, default Tensor<float>), a 2-D tensor of the same type "
"and shape as X. This input is a tensor of probabalistic labels "
"for each logit");
......
......@@ -25,8 +25,7 @@ class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
const framework::Tensor *X = context.Input<framework::Tensor>("X");
    const framework::Tensor *Labels = context.Input<framework::Tensor>("Label");
framework::Tensor *Out = context.Output<framework::Tensor>("Out");
Out->mutable_data<T>(context.GetPlace());
......@@ -52,8 +51,7 @@ class SigmoidCrossEntropyWithLogitsGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
const framework::Tensor *X = context.Input<framework::Tensor>("X");
    const framework::Tensor *Labels = context.Input<framework::Tensor>("Label");
const framework::Tensor *dOut =
context.Input<framework::Tensor>(framework::GradVarName("Out"));
framework::Tensor *dX =
......
......@@ -22,22 +22,20 @@ class SmoothL1LossOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "X must be initialized.");
PADDLE_ENFORCE(ctx->HasInput("Y"), "Y must be initialized.");
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null.");
auto x_dims = ctx->GetInputDim("X");
auto y_dims = ctx->GetInputDim("Y");
PADDLE_ENFORCE_EQ(x_dims, y_dims, "The shape of X and Y must be the same.");
PADDLE_ENFORCE_EQ(x_dims, y_dims);
PADDLE_ENFORCE_GE(x_dims.size(), 2,
"The tensor rank of X must be at least 2.");
"The tensor rank of Input(X) should not be less than 2.");
if (ctx->HasInput("InsideWeight")) {
PADDLE_ENFORCE(ctx->HasInput("OutsideWeight"),
"If weights are provided, must specify both "
"inside and outside weights.");
PADDLE_ENFORCE_EQ(ctx->GetInputDim("InsideWeight"), x_dims,
"The shape of InsideWeight must be same as X.");
PADDLE_ENFORCE_EQ(ctx->GetInputDim("OutsideWeight"), x_dims,
"The shape of OutsideWeight must be same as X.");
PADDLE_ENFORCE_EQ(ctx->GetInputDim("InsideWeight"), x_dims);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("OutsideWeight"), x_dims);
}
ctx->SetOutputDim("Diff", x_dims);
......@@ -53,25 +51,29 @@ class SmoothL1LossOpMaker : public framework::OpProtoAndCheckerMaker {
framework::OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X",
"The input tensor of smooth l1 loss op."
"The rank should be greater or equal to 2 with shape "
"[batch_size, value_dim1, value_dim2, ..., value_dimN]");
"(Tensor, default Tensor<float>) A tensor with rank at least 2. "
"The input value of smooth l1 loss op with shape "
"[batch_size, dim1, ..., dimN].");
AddInput("Y",
"The target tensor of smooth l1 loss op "
"with the same shape as X.");
"(Tensor, default Tensor<float>) A tensor with rank at least 2. "
"The target value of smooth l1 loss op with same shape as X.");
AddInput("InsideWeight",
"Optional input tensor of smooth l1 loss op with the same shape "
"as X. If provided, the result of (X - Y) will be multiplied "
"(Tensor, default Tensor<float>) A tensor with rank at least 2. "
"This input is optional and should have same shape with X. "
"If provided, the result of (X - Y) will be multiplied "
"by this tensor element by element.")
.AsDispensable();
AddInput("OutsideWeight",
"Optinal input of smooth l1 loss op with the same shape as X."
"If provided, the output smooth l1 loss will be multiplied by "
"this tensor element by element.")
"(Tensor, default Tensor<float>) A tensor with rank at least 2. "
"This input is optional and should have same shape with X. "
"If provided, the out smooth l1 loss will be multiplied by this "
"tensor element by element.")
.AsDispensable();
AddOutput("Diff", "Intermediate variable to cache InsideWeight*(X-Y).")
AddOutput("Diff", "Intermediate variable to cache InsideWeight * (X - Y).")
.AsIntermediate();
AddOutput("Out", "Smooth l1 loss.");
AddOutput("Out",
"(Tensor, default Tensor<float>) A tensor with rank be 2. "
"The output smooth l1 loss with shape [batch_size, 1].");
AddAttr<AttrType>("sigma",
"Hyper parameter of smooth l1 loss op."
"A float scalar with default value 3.0.")
......@@ -79,15 +81,23 @@ class SmoothL1LossOpMaker : public framework::OpProtoAndCheckerMaker {
AddComment(R"DOC(
Smooth L1 Loss Operator.
This operator computes the smooth l1 loss for X and Y.
The operator takes the first dimension of X and Y as batch size.
For each instance, it computes the smooth l1 loss element by element first
and then sums all the losses. So the shape of Out is [batch_size, 1].
The equation is:
$$
Out_{\sigma}(X, Y)_i = \begin{cases}
0.5 * (\sigma * (X_i - Y_i))^2, & \text{if } |X_i - Y_i| < \frac{1}{\sigma^2} \\
\frac{|X_i - Y_i| - 0.5}{\sigma^2}, & \text{otherwise}
\end{cases}
$$
In the above equation, $Out_{\sigma}(X, Y)_i$, $X_i$ and $Y_i$ represent the ith
element of Out, X and Y.
)DOC");
}
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/softmax_with_cross_entropy_op.h"
namespace paddle {
namespace operators {
......
......@@ -37,10 +37,16 @@ class SumOp : public framework::OperatorWithKernel {
size_t N = x_dims.size();
    PADDLE_ENFORCE_GT(N, 1, "Input tensors count should be > 1.");

    framework::DDim in_dim({0});
    for (auto& x_dim : x_dims) {
      if (framework::product(x_dim) == 0) {
        continue;
      }
      if (framework::product(in_dim) == 0) {
        in_dim = x_dim;
      } else {
        PADDLE_ENFORCE_EQ(in_dim, x_dim, "Input tensors must have same shape");
      }
    }
    ctx->SetOutputDim("Out", in_dim);
ctx->ShareLoD("X", /*->*/ "Out");
......@@ -51,9 +57,23 @@ class SumOp : public framework::OperatorWithKernel {
const framework::ExecutionContext& ctx) const override {
auto x_vars = ctx.MultiInputVar("X");
if (x_vars[0]->IsType<framework::LoDTensor>()) {
      int dtype = -1;
      for (auto& x_var : x_vars) {
        auto& lod_tensor = x_var->Get<framework::LoDTensor>();
        if (lod_tensor.numel() == 0) {
          continue;
        }
        if (dtype == -1) {
          dtype = framework::ToDataType(lod_tensor.type());
        } else {
          PADDLE_ENFORCE_EQ(dtype, framework::ToDataType(lod_tensor.type()));
        }
      }
      PADDLE_ENFORCE_NE(dtype, -1,
                        "Sum operator should have at least one tensor");

      return framework::OpKernelType(static_cast<framework::DataType>(dtype),
                                     ctx.device_context());
} else if (x_vars[0]->IsType<framework::SelectedRows>()) {
return framework::OpKernelType(
framework::ToDataType(
......
......@@ -53,6 +53,9 @@ class SumKernel : public framework::OpKernel<T> {
for (int i = in_place ? 1 : 0; i < N; i++) {
if (in_vars[i]->IsType<framework::LoDTensor>()) {
auto &in_t = in_vars[i]->Get<framework::LoDTensor>();
if (in_t.numel() == 0) {
continue;
}
auto in = EigenVector<T>::Flatten(in_t);
result.device(place) = result + in;
} else if (in_vars[i]->IsType<framework::SelectedRows>()) {
......@@ -84,7 +87,7 @@ class SumKernel : public framework::OpKernel<T> {
int64_t offset = 0;
for (int i = 0; i < N; i++) {
PADDLE_ENFORCE_EQ(out->height(),
                        in_vars[i]->Get<SelectedRows>().height());
functor(context.device_context(), in_vars[i]->Get<SelectedRows>(),
offset, out);
offset += in_vars[i]->Get<SelectedRows>().value().numel();
......
......@@ -27,7 +27,7 @@ class WriteToArrayOp : public ArrayOp {
void Run(const framework::Scope &scope,
const platform::DeviceContext &dev_ctx) const override {
auto *x = scope.FindVar(Input("X"));
    if (x == nullptr) return;
auto &x_tensor = x->Get<framework::LoDTensor>();
size_t offset = GetOffset(scope, dev_ctx);
auto *out =
......@@ -37,9 +37,15 @@ class WriteToArrayOp : public ArrayOp {
<< " to " << offset + 1;
out->resize(offset + 1);
}
    if (x_tensor.memory_size() > 0) {
      auto *out_tensor = &out->at(offset);
      CopyFrom(x_tensor, dev_ctx.GetPlace(), dev_ctx, out_tensor);
      out_tensor->set_lod(x_tensor.lod());
    } else {
      VLOG(10) << "WARNING: The input tensor 'x_tensor' holds no memory, so "
                  "nothing has been written to output array["
               << offset << "].";
    }
}
};
......@@ -70,7 +76,9 @@ class WriteToArrayInferShape : public framework::InferShapeBase {
PADDLE_ENFORCE(context->HasInput("I"), "Must set the subscript index");
PADDLE_ENFORCE_EQ(framework::product(context->GetInputDim("I")), 1,
"The number of element of subscript index must be 1");
PADDLE_ENFORCE(context->HasInput("X"), NotHasXError());
if (!context->HasInput("X")) {
return;
}
PADDLE_ENFORCE(context->HasOutput("Out"), NotHasOutError());
context->SetOutputDim("Out", context->GetInputDim("X"));
}
......@@ -93,9 +101,10 @@ class WriteToArrayInferVarType : public framework::VarTypeInference {
auto &out = detail::Ref(block->FindRecursiveOrCreateVar(out_name),
"Cannot found %s", out_name);
out.SetType(framework::VarDesc::LOD_TENSOR_ARRAY);
    auto *x = block->FindVarRecursive(x_name);
    if (x != nullptr) {
      out.SetDataType(x->GetDataType());
    }
}
};
......@@ -115,10 +124,13 @@ class ReadFromArrayOp : public ArrayOp {
PADDLE_ENFORCE(out != nullptr, "Out must be set");
auto *out_tensor = out->GetMutable<framework::LoDTensor>();
size_t offset = GetOffset(scope, dev_ctx);
    if (offset < x_array.size()) {
      framework::CopyFrom(x_array[offset], dev_ctx.GetPlace(), dev_ctx,
                          out_tensor);
      out_tensor->set_lod(x_array[offset].lod());
    } else {
      VLOG(10) << "offset " << offset << " >= " << x_array.size();
    }
}
};
......
......@@ -98,8 +98,6 @@ class WhileGradOp : public framework::OperatorBase {
void Run(const framework::Scope &scope,
const platform::DeviceContext &dev_ctx) const override {
framework::Executor executor(dev_ctx);
auto *block = Attr<framework::BlockDescBind *>(kStepBlock);
auto *program = block->Program();
......@@ -124,8 +122,12 @@ class WhileGradOp : public framework::OperatorBase {
auto inside_og_name = inside_og_names[i];
VLOG(10) << "Linking outside " << outside_og_name << " --> inside "
<< inside_og_name;
        auto &og_outside =
            detail::Ref(scope.FindVar(outside_og_name),
                        "Cannot find Outside Gradient %s", outside_og_name);
        auto &og_inside =
            detail::Ref(cur_scope.Var(inside_og_name),
                        "Cannot find inside gradient %s", inside_og_name);
if (og_outside.Type().hash_code() ==
typeid(framework::LoDTensor).hash_code()) {
auto &outside_tensor = og_outside.Get<framework::LoDTensor>();
......@@ -160,7 +162,7 @@ class WhileGradOp : public framework::OperatorBase {
PADDLE_ENFORCE_EQ(pg_names.size(), p_names.size());
for (size_t param_id = 0; param_id < pg_names.size(); ++param_id) {
if (pg_names[param_id] == framework::kEmptyVarName) {
          continue;  // parameter doesn't have gradient
}
auto inside_grad_name = framework::GradVarName(p_names[param_id]);
......@@ -190,7 +192,6 @@ class WhileGradOp : public framework::OperatorBase {
}
}
// sum gradient
auto new_inside_name = cur_scope.Rename(inside_grad_name);
auto sum_op = framework::OpRegistry::CreateOp(
"sum", {{"X", {pg_names[param_id], new_inside_name}}},
......@@ -207,18 +208,35 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker {
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
  std::unique_ptr<framework::OpDescBind> Apply() const override {
auto *grad = new framework::OpDescBind();
grad->SetType("while_grad");
grad->SetInput(kParameters, Input(kParameters));
    // Not all of the IGs will be generated by the inner gradient operators of
    // the while op. Ignore IGs that are not generated by the inside block.
    auto igs = InputGrad(kParameters, /*do not drop empty gradient*/ false);
    std::unordered_set<std::string> all_outs;
    for (size_t i = 0; i < grad_block_[0]->OpSize(); ++i) {
      for (auto &oname : grad_block_[0]->Op(i)->OutputArgumentNames()) {
        all_outs.insert(oname);
      }
    }
    for (auto &each_ig : igs) {
      if (all_outs.find(each_ig) == all_outs.end()) {
        VLOG(10) << "Ignore " << each_ig;
        each_ig = framework::kEmptyVarName;
      }
    }
    grad->SetOutput(framework::GradVarName(kParameters), igs);
grad->SetInput(kOutputs, Output(kOutputs));
// OG should be re-calculated by step blocks, since many outputs of while op
// do not need to calculate gradients.
std::unordered_set<std::string> block_ins;
auto *fwd_block = this->grad_block_[0]->ParentBlock();
{
for (auto &p : Input(kParameters)) {
block_ins.insert(p);
......@@ -233,6 +251,13 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker {
if (block_ins.find(input_name) != block_ins.end()) {
continue;
}
          // If an input of the op is generated by the forward block, do not
          // add it as an input of the gradient while-op again.
if (fwd_block->FindVar(input_name) != nullptr) {
continue;
}
extra_inputs.insert(input_name);
}
......@@ -287,7 +312,6 @@ class WhileGradOpShapeInference : public framework::InferShapeBase {
auto p_names = ctx->Inputs(kParameters);
auto pg_names = ctx->Outputs(kParamGrads);
auto var_types = ctx->GetInputsVarType(kParameters);
std::vector<std::string> names_to_set;
std::vector<framework::DDim> dims_to_set;
......@@ -295,13 +319,14 @@ class WhileGradOpShapeInference : public framework::InferShapeBase {
if (pg_names[i] == framework::kEmptyVarName) {
continue;
}
auto dims = ctx->GetInputsElementDim(kParameters, i);
if (var_types[i] == framework::VarDesc::LOD_TENSOR) {
names_to_set.push_back(pg_names[i]);
        dims_to_set.push_back(dims);
} else if (var_types[i] == framework::VarDesc::LOD_TENSOR_ARRAY) {
// not sure how to set the dim of LOD_TENSOR_ARRAY
names_to_set.push_back(pg_names[i]);
        dims_to_set.push_back(dims);
}
}
ctx->SetDims(names_to_set, dims_to_set);
......
......@@ -127,8 +127,3 @@ TEST_F(OptimizerTest, TestGetWeight) { TestGetWeight(); }
TEST_F(OptimizerTest, TestUpdate) { TestUpdate(); }
TEST_F(OptimizerTest, TestCheckPoint) { TestCheckPoint(); }
......@@ -46,8 +46,3 @@ TEST(TensorToProto, Case2) {
EXPECT_EQ(t1[i], t[i]);
}
}
......@@ -234,16 +234,24 @@ inline void throw_on_error(T e) {
__PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <, >=, __VA_ARGS__)
#define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) \
__PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <=, >, __VA_ARGS__)
#define PADDLE_ENFORCE_NOT_NULL(__VAL, ...) \
do { \
if (UNLIKELY(nullptr == (__VAL))) { \
PADDLE_THROW(#__VAL " should not be null\n%s", \
paddle::string::Sprintf("" __VA_ARGS__)); \
} \
} while (0)
#define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...) \
do { \
if (UNLIKELY(!((__VAL0)__CMP(__VAL1)))) { \
PADDLE_THROW("enforce %s " #__CMP " %s failed, %s " #__INV_CMP \
" %s\n%s", \
#__VAL0, #__VAL1, paddle::string::to_string(__VAL0), \
paddle::string::to_string(__VAL1), \
paddle::string::Sprintf("" __VA_ARGS__)); \
} \
} while (0)
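// Wrapping these macros in do { ... } while (0) makes each expand to a single
// statement, so they compose safely with unbraced if/else, and the UNLIKELY
// check keeps the string formatting off the non-failing fast path.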
} // namespace platform
} // namespace paddle
......@@ -75,15 +75,19 @@ size_t GpuMaxChunkSize() {
GpuMemoryUsage(available, total);
// Reserving the rest memory for page tables, etc.
  size_t reserving = 0.05 * total;

  // If available is less than the minimum chunk size, no usable memory exists.
  available =
      std::max(std::max(available, GpuMinChunkSize()) - GpuMinChunkSize(),
               reserving) -
      reserving;

  size_t allocating = FLAGS_fraction_of_gpu_memory_to_use * total;

  PADDLE_ENFORCE_LT(allocating, available);

  return allocating;
}
void GpuMemcpyAsync(void *dst, const void *src, size_t count,
......
......@@ -49,7 +49,7 @@ if(WITH_TESTING)
add_subdirectory(test)
endif()
if(NOT MOBILE_INFERENCE)
add_executable(paddle_pserver_main ${PSERVER_MAIN_SOURCES})
link_paddle_exe(paddle_pserver_main)
......
......@@ -5,4 +5,6 @@ if(WITH_PYTHON)
${GLOB_OP_LIB})
endif(WITH_PYTHON)
if(WITH_DOC)
  cc_binary(print_operators_doc SRCS print_operators_doc.cc DEPS ${GLOB_OP_LIB})
endif(WITH_DOC)
......@@ -36,6 +36,7 @@ function cmake_gen() {
${PYTHON_FLAGS}
-DWITH_DOC=OFF
-DWITH_GPU=${WITH_GPU:-OFF}
-DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF}
-DWITH_MKL=${WITH_MKL:-ON}
-DWITH_AVX=${WITH_AVX:-OFF}
-DWITH_GOLANG=${WITH_GOLANG:-ON}
......@@ -57,6 +58,7 @@ EOF
${PYTHON_FLAGS} \
-DWITH_DOC=OFF \
-DWITH_GPU=${WITH_GPU:-OFF} \
-DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} \
-DWITH_MKL=${WITH_MKL:-ON} \
-DWITH_AVX=${WITH_AVX:-OFF} \
-DWITH_GOLANG=${WITH_GOLANG:-ON} \
......@@ -183,6 +185,14 @@ EOF
${DOCKERFILE_GPU_ENV}
ADD go/cmd/pserver/pserver /usr/bin/
ADD go/cmd/master/master /usr/bin/
EOF
if [[ ${WITH_DOC:-OFF} == 'ON' ]]; then
cat >> /paddle/build/Dockerfile <<EOF
ADD paddle/pybind/print_operators_doc /usr/bin/
EOF
fi
cat >> /paddle/build/Dockerfile <<EOF
# default command shows the paddle version and exit
CMD ["paddle", "version"]
EOF
......
......@@ -5,4 +5,8 @@ if(WITH_TESTING)
add_dependencies(paddle_test_main paddle_proto ${external_project_dependencies})
add_library(paddle_test_util STATIC TestUtil.cpp)
add_dependencies(paddle_test_util paddle_proto ${external_project_dependencies})
if(NOT MOBILE_INFERENCE)
add_library(paddle_gtest_main STATIC paddle_gtest_main.cc)
add_dependencies(paddle_gtest_main paddle_memory gtest gflags)
endif()
endif()
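# paddle_gtest_main (see the source below) supplies a shared gtest main() that
# also parses the memory-related gflags, so individual test files no longer
# define their own main().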
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cstring>
#include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/memory/memory.h"
int main(int argc, char** argv) {
std::vector<char*> new_argv;
std::string gflags_env;
new_argv.push_back(argv[0]);
#ifdef PADDLE_WITH_CUDA
new_argv.push_back(
strdup("--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory"));
#else
new_argv.push_back(strdup("--tryfromenv=use_pinned_memory"));
#endif
int new_argc = static_cast<int>(new_argv.size());
char** new_argv_address = new_argv.data();
google::ParseCommandLineFlags(&new_argc, &new_argv_address, false);
testing::InitGoogleTest(&argc, argv);
paddle::memory::Used(paddle::platform::CPUPlace());
#ifdef PADDLE_WITH_CUDA
paddle::memory::Used(paddle::platform::GPUPlace(0));
#endif
return RUN_ALL_TESTS();
}
......@@ -54,7 +54,7 @@ if(WITH_TESTING)
add_subdirectory(tests)
endif()
if(NOT MOBILE_INFERENCE)
add_paddle_exe(paddle_trainer TrainerMain.cpp)
add_paddle_exe(paddle_merge_model MergeModel.cpp)
......@@ -74,7 +74,5 @@ endif()
if(WITH_GOLANG)
add_dependencies(paddle_trainer_lib paddle_pserver_cclient)
target_link_libraries(paddle_trainer_lib paddle_pserver_cclient)
target_link_libraries(paddle_trainer paddle_pserver_cclient)
endif(WITH_GOLANG)
......@@ -14,20 +14,21 @@ import optimizer
import backward
import regularizer
from param_attr import ParamAttr
from data_feeder import DataFeeder
from core import LoDTensor, CPUPlace, GPUPlace
Tensor = LoDTensor
__all__ = framework.__all__ + executor.__all__ + [
'io', 'initializer', 'layers', 'nets', 'optimizer', 'backward',
    'regularizer', 'LoDTensor', 'CPUPlace', 'GPUPlace', 'Tensor', 'ParamAttr',
    'DataFeeder'
]
def __read_gflags_from_env__():
"""
Enable reading gflags from environment variables.
Returns:
None
"""
......@@ -36,7 +37,8 @@ def __read_gflags_from_env__():
read_env_flags = ['use_pinned_memory']
if core.is_compile_gpu():
read_env_flags.append('fraction_of_gpu_memory_to_use')
    core.init_gflags([sys.argv[0]] +
                     ["--tryfromenv=" + ",".join(read_env_flags)])
__read_gflags_from_env__()
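# Note: passing only sys.argv[0] (rather than the full argv) keeps user
# command-line arguments out of gflags parsing; the listed flags are instead
# picked up from environment variables via --tryfromenv.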