Commit c8c4b6e4 authored by wwhu

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into clip_by_norm

update
@@ -30,6 +30,7 @@ addons:
      - automake
      - libtool
      - ccache
  ssh_known_hosts: 52.76.173.135
before_install:
  - if [[ "$JOB" == "check_style" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
  # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python
@@ -42,6 +43,14 @@ script:
  - |
    timeout 2580 paddle/scripts/travis/${JOB}.sh # 43min timeout
    RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else false; fi;
  - |
    if [[ "$JOB" != "build_doc" ]]; then exit 0; fi;
    if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi;
    if [[ "$TRAVIS_BRANCH" != "develop" && ! "$TRAVIS_BRANCH" =~ ^v[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-\S*)?$ ]]; then exit 0; fi;
    export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/master/scripts/deploy/deploy_docs.sh
    export DOCS_DIR=`pwd`
    cd ..
    curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH $DOCS_DIR $DOCS_DIR/build/doc
notifications:
  email:
    on_success: change
...
@@ -126,7 +126,7 @@ include(external/swig)    # download, build, install swig
include(external/warpctc)  # download, build, install warpctc
include(external/any)      # download libn::any
include(external/eigen)    # download eigen3
include(external/pybind11) # download pybind11
include(external/nccl)
include(cudnn)             # set cudnn libraries, must before configure
...
@@ -23,7 +23,7 @@ On each machine, we will test and compare the performance of training on single
## Benchmark Model
### Server
Test on batch size 64, 128, 256 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
Input image size - 3 * 224 * 224, Time: images/second
...
#!/usr/bin/env python
from paddle.trainer_config_helpers import *

height = 224
width = 224
num_class = 1000
batch_size = get_config_arg('batch_size', int, 64)
layer_num = get_config_arg("layer_num", int, 50)
is_test = get_config_arg("is_test", bool, False)

args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
define_py_data_sources2(
    "train.list", None, module="provider", obj="process", args=args)

settings(
    batch_size=batch_size,
    learning_rate=0.01 / batch_size,
    learning_method=MomentumOptimizer(0.9),
    regularization=L2Regularization(0.0005 * batch_size))

####################### Network Configuration #############


def conv_bn_layer(name,
                  input,
                  filter_size,
                  num_filters,
                  stride,
                  padding,
                  channels=None,
                  active_type=ReluActivation()):
    """
    A wrapper for a conv layer with batch normalization layers.
    Note:
    The conv layer itself has no activation; the activation is applied
    by the batch norm layer.
    """
    tmp = img_conv_layer(
        name=name + "_conv",
        input=input,
        filter_size=filter_size,
        num_channels=channels,
        num_filters=num_filters,
        stride=stride,
        padding=padding,
        act=LinearActivation(),
        bias_attr=False)
    return batch_norm_layer(
        name=name + "_bn", input=tmp, act=active_type, use_global_stats=is_test)


def bottleneck_block(name, input, num_filters1, num_filters2):
    """
    A wrapper for the bottleneck building block in ResNet.
    The last conv_bn_layer has no activation.
    The addto layer uses ReLU activation.
    """
    last_name = conv_bn_layer(
        name=name + '_branch2a',
        input=input,
        filter_size=1,
        num_filters=num_filters1,
        stride=1,
        padding=0)
    last_name = conv_bn_layer(
        name=name + '_branch2b',
        input=last_name,
        filter_size=3,
        num_filters=num_filters1,
        stride=1,
        padding=1)
    last_name = conv_bn_layer(
        name=name + '_branch2c',
        input=last_name,
        filter_size=1,
        num_filters=num_filters2,
        stride=1,
        padding=0,
        active_type=LinearActivation())
    return addto_layer(
        name=name + "_addto", input=[input, last_name], act=ReluActivation())


def mid_projection(name, input, num_filters1, num_filters2, stride=2):
    """
    A wrapper for the middle projection in ResNet.
    Projection shortcuts are used for increasing dimensions,
    while the other shortcuts are identity.
    branch1: a projection shortcut used for increasing dimensions,
    with no activation.
    branch2x: a bottleneck building block whose shortcut is identity.
    """
    # stride = 2
    branch1 = conv_bn_layer(
        name=name + '_branch1',
        input=input,
        filter_size=1,
        num_filters=num_filters2,
        stride=stride,
        padding=0,
        active_type=LinearActivation())
    last_name = conv_bn_layer(
        name=name + '_branch2a',
        input=input,
        filter_size=1,
        num_filters=num_filters1,
        stride=stride,
        padding=0)
    last_name = conv_bn_layer(
        name=name + '_branch2b',
        input=last_name,
        filter_size=3,
        num_filters=num_filters1,
        stride=1,
        padding=1)
    last_name = conv_bn_layer(
        name=name + '_branch2c',
        input=last_name,
        filter_size=1,
        num_filters=num_filters2,
        stride=1,
        padding=0,
        active_type=LinearActivation())
    return addto_layer(
        name=name + "_addto", input=[branch1, last_name], act=ReluActivation())


img = data_layer(name='image', size=height * width * 3)


def deep_res_net(res2_num=3, res3_num=4, res4_num=6, res5_num=3):
    """
    A wrapper for the 50-, 101-, and 152-layer versions of ResNet.
    res2_num: number of blocks stacked in conv2_x
    res3_num: number of blocks stacked in conv3_x
    res4_num: number of blocks stacked in conv4_x
    res5_num: number of blocks stacked in conv5_x
    """
    # For ImageNet
    # conv1: 112x112
    tmp = conv_bn_layer(
        "conv1",
        input=img,
        filter_size=7,
        channels=3,
        num_filters=64,
        stride=2,
        padding=3)
    tmp = img_pool_layer(name="pool1", input=tmp, pool_size=3, stride=2)

    # conv2_x: 56x56
    tmp = mid_projection(
        name="res2_1", input=tmp, num_filters1=64, num_filters2=256, stride=1)
    for i in xrange(2, res2_num + 1, 1):
        tmp = bottleneck_block(
            name="res2_" + str(i), input=tmp, num_filters1=64, num_filters2=256)

    # conv3_x: 28x28
    tmp = mid_projection(
        name="res3_1", input=tmp, num_filters1=128, num_filters2=512)
    for i in xrange(2, res3_num + 1, 1):
        tmp = bottleneck_block(
            name="res3_" + str(i),
            input=tmp,
            num_filters1=128,
            num_filters2=512)

    # conv4_x: 14x14
    tmp = mid_projection(
        name="res4_1", input=tmp, num_filters1=256, num_filters2=1024)
    for i in xrange(2, res4_num + 1, 1):
        tmp = bottleneck_block(
            name="res4_" + str(i),
            input=tmp,
            num_filters1=256,
            num_filters2=1024)

    # conv5_x: 7x7
    tmp = mid_projection(
        name="res5_1", input=tmp, num_filters1=512, num_filters2=2048)
    for i in xrange(2, res5_num + 1, 1):
        tmp = bottleneck_block(
            name="res5_" + str(i),
            input=tmp,
            num_filters1=512,
            num_filters2=2048)

    tmp = img_pool_layer(
        name='avgpool',
        input=tmp,
        pool_size=7,
        stride=1,
        pool_type=AvgPooling())

    return fc_layer(input=tmp, size=num_class, act=SoftmaxActivation())


if layer_num == 50:
    resnet = deep_res_net(3, 4, 6, 3)
elif layer_num == 101:
    resnet = deep_res_net(3, 4, 23, 3)
elif layer_num == 152:
    resnet = deep_res_net(3, 8, 36, 3)
else:
    raise ValueError("Wrong layer number: %d" % layer_num)

lbl = data_layer(name="label", size=num_class)
loss = cross_entropy(name='loss', input=resnet, label=lbl)
inputs(img, lbl)
outputs(loss)
@@ -5,22 +5,23 @@ function train() {
  export OMP_DYNAMIC="FALSE"
  export KMP_AFFINITY="granularity=fine,compact,0,0"
  topology=$1
  layer_num=$2
  bs=$3
  use_mkldnn=$4
  if [ $4 == "True" ]; then
    thread=1
    log="logs/${topology}-${layer_num}-mkldnn-${bs}.log"
  elif [ $4 == "False" ]; then
    thread=`nproc`
    # each trainer_count use only 1 core to avoid conflict
    export OMP_NUM_THREADS=1
    export MKL_NUM_THREADS=1
    log="logs/${topology}-${layer_num}-${thread}mklml-${bs}.log"
  else
    echo "Wrong input $4, use True or False."
    exit 0
  fi
  args="batch_size=${bs},layer_num=${layer_num}"
  config="${topology}.py"
  paddle train --job=time \
    --config=$config \
@@ -40,12 +41,9 @@ if [ ! -d "logs" ]; then
  mkdir logs
fi

for use_mkldnn in True False; do
  for batchsize in 64 128 256; do
    train vgg 19 $batchsize $use_mkldnn
    train resnet 50 $batchsize $use_mkldnn
  done
done
@@ -79,9 +79,8 @@ if(NOT DEFINED IOS_ARCH)
    # FIXME(liuyiqun): support "armv7;armv7s;arm64" future
    set(IOS_ARCH "arm64")
  elseif(IOS_PLATFORM STREQUAL "SIMULATOR")
    # FIXME(liuyiqun): support "i386;x86_64" future
    set(IOS_ARCH "x86_64")
  endif()
endif()
set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE string "Build architecture for iOS")
...
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

if(NOT WITH_GPU)
  return()
endif()

include(ExternalProject)

set(NCCL_SOURCE_DIR ${THIRD_PARTY_PATH}/nccl)
...
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

if(NOT WITH_PYTHON)
  return()
endif()

include(ExternalProject)

set(PYBIND_SOURCE_DIR ${THIRD_PARTY_PATH}/pybind)

include_directories(${PYBIND_SOURCE_DIR}/src/extern_pybind/include)

ExternalProject_Add(
  extern_pybind
@@ -17,14 +35,12 @@ ExternalProject_Add(
  TEST_COMMAND ""
)

if(${CMAKE_VERSION} VERSION_LESS "3.3.0")
  set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/pybind_dummy.c)
  file(WRITE ${dummyfile} "const char * dummy_pybind = \"${dummyfile}\";")
  add_library(pybind STATIC ${dummyfile})
else()
  add_library(pybind INTERFACE)
endif()

add_dependencies(pybind extern_pybind)
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
# This file is used to check all supported levels of AVX on your machine
# so that PaddlePaddle can unleash the vectorization power of multicore.

include(CheckCXXSourceRuns)
include(CheckCXXSourceCompiles)

if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
  set(MMX_FLAG "-mmmx")
  set(SSE2_FLAG "-msse2")
  set(SSE3_FLAG "-msse3")
  set(AVX_FLAG "-mavx")
  set(AVX2_FLAG "-mavx2")
elseif(MSVC)
  set(MMX_FLAG "/arch:MMX")
  set(SSE2_FLAG "/arch:SSE2")
  set(SSE3_FLAG "/arch:SSE3")
  set(AVX_FLAG "/arch:AVX")
  set(AVX2_FLAG "/arch:AVX2")
endif()

set(CMAKE_REQUIRED_FLAGS_RETAINED ${CMAKE_REQUIRED_FLAGS})

# Check MMX
set(CMAKE_REQUIRED_FLAGS ${MMX_FLAG})
set(MMX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
CHECK_CXX_SOURCE_RUNS("
#include <mmintrin.h>
int main()
@@ -32,6 +33,7 @@ int main()
# Check SSE2
set(CMAKE_REQUIRED_FLAGS ${SSE2_FLAG})
set(SSE2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
CHECK_CXX_SOURCE_RUNS("
#include <emmintrin.h>
int main()
@@ -42,6 +44,7 @@ int main()
# Check SSE3
set(CMAKE_REQUIRED_FLAGS ${SSE3_FLAG})
set(SSE3_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
CHECK_CXX_SOURCE_RUNS("
#include <pmmintrin.h>
int main()
@@ -55,6 +58,7 @@ int main()
# Check AVX
set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG})
set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
CHECK_CXX_SOURCE_RUNS("
#include <immintrin.h>
int main()
@@ -67,6 +71,7 @@ int main()
# Check AVX 2
set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG})
set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
CHECK_CXX_SOURCE_RUNS("
#include <immintrin.h>
int main()
...
# Design Doc: float16
## Why float16
Half precision (float16) is a binary floating-point format that occupies 16 bits in memory. float16 is half the size of the traditional 32-bit single-precision format (float), with lower precision and a smaller range.
When high-precision computation is not required, using the float16 data type could potentially
- reduce storage space, memory bandwidth, and power usage;
- increase the chance of data fitting into a smaller cache of lower latency;
- provide an arithmetic speed-up if supported by hardware.
## Survey of current float16 support
A brief survey of float16 support on different compilers, hardware, and libraries is given below. Interested readers can refer to [link1](https://github.com/PaddlePaddle/Paddle/issues/4853) and [link2](https://github.com/Xreki/Xreki.github.io/blob/master/multi_data_types_in_dl_framework/ppt/float16_and_quantized_type.md) for more information.
The goal of the float16 class is to serve as a key with which the executor can find and run the version of an operator kernel's compute method specialized for float16. To make writing customized float16 kernels easier, it should be compatible with the various natively supported float16 implementations, including `__half` for CUDA, `float16_t` for ARM, and `Eigen::half` for Eigen.
### Compiler
- nvcc supports the `__half` data type since CUDA 7.5.
- `__fp16` or `float16_t` is supported as a storage type for gcc >= 6.1 and clang >= 3.4.
- `__fp16` or `float16_t` is supported as an arithmetic type for gcc >= 7.1 and clang >= 3.9.
### Hardware
- `__half` is supported on GPUs with compute capability >= 5.3.
- `__fp16` is supported as a storage type for ARMv7-A, ARMv8-A, and above.
- `__fp16` is supported as an arithmetic type from ARMv8.2-A onward. (Currently, the only microarchitecture implementing ARMv8.2-A is the ARM Cortex-A75, announced in May 2017. There seem to be no application processors on the market yet that adopt this architecture; the Qualcomm Snapdragon 845 is reported to use the Cortex-A75 design and to ship in mobile devices in early 2018.)
### Libraries
- [Eigen](https://github.com/RLovelett/eigen) >= 3.3 supports float16 calculation on both GPU and CPU using the `Eigen::half` class. It is mostly useful for Nvidia GPUs because of its overloaded arithmetic operators that use CUDA intrinsics. On the CPU it falls back to software emulation for calculation, and there is no special treatment for ARM processors.
- [ARM compute library](https://github.com/ARM-software/ComputeLibrary) >= 17.02.01 supports NEON FP16 kernels (requires ARMv8.2-A CPU).
## Implementation
The float16 class internally holds its data as a single 16-bit `uint16_t`:
```cpp
struct float16 {
  uint16_t x;
};
```
float16 supports the following features:
- constructors / assignment operators that take input from primitive data types including bool, integers of various lengths, float, and double.
- constructors / assignment operators that take input from `__half` on CUDA, `float16_t` on ARM, and `Eigen::half` on Eigen.
- conversion operators to primitive data types and to the half-precision types on CUDA, ARM, and Eigen.
- overloaded arithmetic operators for CUDA, ARM, and non-ARM CPU, respectively. These operators take advantage of the CUDA and ARM intrinsics on the corresponding hardware.
To support the above features, two fundamental conversion functions are provided:
```cpp
float16 float_to_half_rn(float f);  // convert to half precision in round-to-nearest-even mode
float half_to_float(float16 h);
```
which provide one-to-one conversion between float32 and float16. These two functions dispatch to different conversion routines based on the current hardware: CUDA/ARM intrinsics are used when the corresponding hardware is available, and when the hardware or compiler does not support float32-to-float16 conversion, software emulation performs the conversion.
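As an illustration of the emulation path, below is a minimal bit-level sketch of the two functions, assuming no CUDA/ARM intrinsics are available. It shows the round-to-nearest-even logic only and is not the actual PaddlePaddle implementation.
```cpp
// Software-emulation sketch of the float <-> float16 conversions (assumed
// fallback path; illustration only).
#include <cstdint>
#include <cstring>

struct float16 {  // restated from above so the sketch is self-contained
  uint16_t x;
};

float16 float_to_half_rn(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  uint16_t sign = static_cast<uint16_t>((bits >> 16) & 0x8000u);
  uint32_t absx = bits & 0x7fffffffu;
  uint16_t h;
  if (absx >= 0x7f800000u) {         // inf stays inf, NaN stays NaN
    h = absx > 0x7f800000u ? 0x7e00u : 0x7c00u;
  } else if (absx >= 0x47800000u) {  // |f| >= 65536 overflows to inf
    h = 0x7c00u;
  } else if (absx >= 0x38800000u) {  // normal half range
    // Rebias the exponent and round the mantissa to nearest even.
    uint32_t rounded = absx + 0xfffu + ((absx >> 13) & 1u);
    h = static_cast<uint16_t>((rounded >> 13) - (112u << 10));
  } else {                           // subnormal half or zero
    uint32_t mant = (absx & 0x7fffffu) | 0x800000u;  // implicit leading 1
    uint32_t shift = 126u - (absx >> 23);
    if (shift > 24u) {
      h = 0;                         // too small: rounds to +/- zero
    } else {
      uint32_t round = ((1u << shift) >> 1) - 1u + ((mant >> shift) & 1u);
      h = static_cast<uint16_t>((mant + round) >> shift);
    }
  }
  return float16{static_cast<uint16_t>(sign | h)};
}

float half_to_float(float16 h) {
  uint32_t sign = static_cast<uint32_t>(h.x & 0x8000u) << 16;
  int exp = (h.x >> 10) & 0x1f;
  uint32_t mant = h.x & 0x3ffu;
  uint32_t bits;
  if (exp == 0 && mant == 0) {  // +/- zero
    bits = sign;
  } else if (exp == 0) {        // subnormal half: renormalize
    exp = 1;
    while ((mant & 0x400u) == 0) {
      mant <<= 1;
      --exp;
    }
    mant &= 0x3ffu;             // drop the implicit leading 1
    bits = sign | (static_cast<uint32_t>(exp + 112) << 23) | (mant << 13);
  } else if (exp == 31) {       // inf or NaN
    bits = sign | 0x7f800000u | (mant << 13);
  } else {                      // normal: rebias exponent 15 -> 127
    bits = sign | (static_cast<uint32_t>(exp + 112) << 23) | (mant << 13);
  }
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}
```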
## To do
After the float16 class is available, future work items include:
- Update pybind/tensor_py.h to bind the C++ float16 with numpy float16.
- Modify the `GetKernelType()` method in `framework/operator.h` to make it compatible with float16.
- Create a type-casting operator that can convert the data type in a tensor between float16 and other types.
# Parameter Averaging in PaddlePaddle
## Why Averaging
In a large-scale machine learning setup where the training data is huge, it can take many iterations over the training data before we reach the optimal values of the model parameters. It is therefore desirable to obtain those optimal values in as few passes over the data as possible.
Polyak and Juditsky (1992) showed that the test performance of a simple average of the parameters produced by Stochastic Gradient Descent (SGD) is as good as that of parameters obtained by repeatedly training the model over the training dataset.
Hence, to accelerate Stochastic Gradient Descent, Averaged Stochastic Gradient Descent (ASGD) was proposed by Polyak and Juditsky (1992). In ASGD, the running average of the parameters produced by SGD is used as the estimator for <img src="./images/theta_star.gif"/><br/> . The averaging is done as follows:
<img src="./images/asgd.gif" align="center"/><br/>
We propose averaging for any optimizer similar to how ASGD performs it, as mentioned above.
### How to perform Parameter Averaging in PaddlePaddle
Parameter averaging in PaddlePaddle works in the following way during training:
1. It takes an instance of an ordinary optimizer as input, e.g. RMSPropOptimizer.
2. The optimizer itself is responsible for updating the parameters.
3. The ParameterAverageOptimizer maintains a separate copy of the parameters for itself:
  1. Conceptually, the values of this copy are the average of the parameter values over the most recent N batches.
  2. However, keeping all N instances of the parameters in memory is not feasible.
  3. Therefore, an approximation algorithm is used.
Hence, overall we have two copies of the parameters: one for the optimizer itself, and one for the ParameterAverageOptimizer. The former is used in back propagation, while the latter is used during testing and should be saved.
During the testing/model-saving phase, we perform the following steps:
1. Perform the delayed operations.
2. Save current values of the parameters to a temporary variable.
3. Replace the values of the parameters with the averaged values.
4. Perform testing and/or save the parameters.
5. Restore the values of the parameters once done.
### How to implement Parameter Averaging in PaddlePaddle
We can add the ParameterAverageOptimizer op to the graph through the Python API. Using this approach, we manually add this op to the graph and direct the output of the optimizer op to it during training.
**Advantages**:
- Gives users of PaddlePaddle greater flexibility: they can plug different optimizers into ParameterAverageOptimizer by passing the optimizer to the op.
- Makes it easy for the users to customize and extend the framework.
**Disadvantages**:
- Implementation requires re-writing the averaging methodology in Python.
### Low-Level implementation
In the new design, we propose to create a new operation for averaging parameter updates (ParameterAverageOptimizer). For now, we can add an op that takes in the following as input:
- the optimizer
- the window_size to keep the updates
The ParameterAverageOptimizer op can be like any other operator, with its own CPU/GPU implementation, either using Eigen or separate CPU and GPU kernels. As the initial implementation, we can implement the kernel using Eigen, following the abstraction pattern implemented for [Operators](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/rmsprop_op.h). We also want to support the case where the Trainer/Optimizer runs on the GPU while ParameterAverageOptimizer runs on a CPU.
The idea of building an op for averaging is in sync with the refactored PaddlePaddle philosophy of using operators to represent any computation unit. The way the op will be added to the computation graph will be decided by the [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) in Python API.
### Python API implementation for ParameterAverageOptimizer
Based on Polyak and Juditsky (1992), we can generalize the averaging of updates to any optimizer. The input to the op would be the following:
- Any optimizer (RMSProp, AdaGrad, etc.)
- A window size. The op keeps accumulating updated parameter values over a window of N batches and takes their average. When the window is full, the averaged value is moved to a buffer to avoid loss of precision (a sketch of this scheme follows below).
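The following is a sketch of that windowed scheme. The class name and interface are hypothetical, purely to make the accumulate-then-fold idea concrete; the real op would operate on tensors inside the framework.
```cpp
#include <cstddef>
#include <vector>

// Hypothetical sketch of windowed parameter averaging; not the actual
// PaddlePaddle operator.
class WindowedAverage {
 public:
  WindowedAverage(size_t dim, size_t window_size)
      : window_size_(window_size), sum_(dim, 0.0), buffer_(dim, 0.0) {}

  // Fold one freshly updated parameter vector into the running statistics.
  void Update(const std::vector<double>& param) {
    for (size_t i = 0; i < param.size(); ++i) sum_[i] += param[i];
    if (++num_updates_ == window_size_) {
      // Move the completed window into the buffer to limit precision loss.
      for (size_t i = 0; i < sum_.size(); ++i) {
        buffer_[i] += sum_[i] / window_size_;
        sum_[i] = 0.0;
      }
      ++num_windows_;
      num_updates_ = 0;
    }
  }

  // Average over all updates seen so far: full windows plus the partial one.
  std::vector<double> Average() const {
    size_t total = num_windows_ * window_size_ + num_updates_;
    std::vector<double> avg(buffer_.size(), 0.0);
    if (total == 0) return avg;
    for (size_t i = 0; i < avg.size(); ++i) {
      avg[i] = (buffer_[i] * window_size_ + sum_[i]) / total;
    }
    return avg;
  }

 private:
  size_t window_size_;
  size_t num_updates_ = 0;  // updates in the current, partial window
  size_t num_windows_ = 0;  // completed windows folded into buffer_
  std::vector<double> sum_;
  std::vector<double> buffer_;
};
```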
Using the ParameterAverageOptimizer op, any user can add the operation to their computation graph. However, this would require many lines of code, so we should design Python APIs that support averaging. As per the PaddlePaddle [Python API design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md), the layer functions are responsible for creating operators, operator parameters, and variables. Since ParameterAverageOptimizer will be an operator, it makes sense to create it in the layer functions.
We will have a wrapper written in Python that supports this functionality and implements the actual core computation in the C++ core, as we have done for other [Optimizers](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/rmsprop_op.cc).
#### Creation of the ParameterAverageOptimizer operator
There are two ways to create the ParameterAverageOptimizer op:
1. We create the op immediately while building the computation graph.
2. We add the op in a lazy manner, just before the backward pass, similar to the way the optimization ops are added.
The proposal is to add the op immediately while building the computation graph.
#### High-level API
In PaddlePaddle Python API, users will primarily rely on [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) to create neural network layers. Hence, we also need to provide parameter average functionality in layer functions.
@@ -75,7 +75,7 @@ PaddlePaddle currently supports 8 kinds of learning_rate_schedule; these 8 kinds of learning_rate_sche
optimizer = paddle.optimizer.Adam(
    learning_rate=1e-3,
    learning_rate_schedule="pass_manual",
    learning_rate_args="1:1.0,2:0.9,3:0.8",)
In this example, when the number of trained passes is at most 1, the learning rate is :code:`1e-3 * 1.0`; when it is greater than 1 and at most 2, the learning rate is :code:`1e-3 * 0.9`; and when it is greater than 2, the learning rate is :code:`1e-3 * 0.8`.
...
@@ -145,7 +145,7 @@ Whenever PaddlePaddle releases a new version, a corresponding production image is also released
Jupyter Notebook is an open-source web application for creating and sharing interactive documents containing code, formulas, charts, and text; the documents can be read in a web browser.
PaddlePaddle Book is an interactive Jupyter Notebook made for users and developers.
If you want a deeper understanding of deep learning, PaddlePaddle Book is definitely your best choice.
We provide a Docker image that can run the PaddlePaddle Book directly; simply run:
...
@@ -63,7 +63,7 @@
</tr>
<tr>
<td class="left" rowspan="14">Training</td><td class="left">dot_period</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
...
@@ -8,3 +8,4 @@ PaddlePaddle Documentation
  howto/index_cn.rst
  api/index_cn.rst
  faq/index_cn.rst
  mobile/index_cn.rst
@@ -7,3 +7,4 @@ PaddlePaddle Documentation
  getstarted/index_en.rst
  howto/index_en.rst
  api/index_en.rst
  mobile/index_en.rst
# Building the PaddlePaddle Library for Android
Users can cross-compile a PaddlePaddle library suitable for Android in either of two ways:
- a Docker-container-based build
- a build in a Linux cross-compiling environment
## Docker-Container-Based Build
@@ -20,20 +20,42 @@ $ docker build -t username/paddle-android:dev . -f Dockerfile.android
Once the development image is built, it can be used to compile the Android version of the PaddlePaddle C-API library.
The Android Docker development image exposes two configurable arguments:
<table class="docutils">
<colgroup>
<col width="25%" />
<col width="50%" />
<col width="25%" />
</colgroup>
<thead valign="bottom">
<tr class="row-odd">
<th class="head">Argument</th>
<th class="head">Optional Values</th>
<th class="head">Default</th>
</tr>
</thead>
<tbody valign="top">
<tr class="row-even">
<td>ANDROID_ABI</td>
<td>armeabi-v7a, arm64-v8a</td>
<td>armeabi-v7a</td>
</tr>
<tr class="row-odd">
<td>ANDROID_API</td>
<td>>= 21</td>
<td>21</td>
</tr>
</tbody>
</table>
- To build the PaddlePaddle library for `armeabi-v7a` and `Android API 21`:
  ```bash
  $ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" username/paddle-android:dev
  ```
- To build the PaddlePaddle library for `arm64-v8a` and `Android API 21`:
  ```bash
  $ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=arm64-v8a" -e "ANDROID_API=21" username/paddle-android:dev
  ```
When the above `docker run` command is executed, the container runs the [paddle/scripts/docker/build_android.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build_android.sh) script by default. The script records the CMake configurations commonly used for cross-compiling the Android PaddlePaddle library, automatically builds a standalone toolchain according to `ANDROID_ABI` and `ANDROID_API`, and then compiles and installs. Since the arm64 architecture requires an Android API of at least 21, when `ANDROID_ABI=arm64-v8a` and `ANDROID_API<21`, the Docker container defaults to the `Android API 21` toolchain. Users can refer to the **Configuring Cross-Compiling Arguments** section below to customize the script executed by the Docker container. After compilation and installation finish, the PaddlePaddle C-API library is installed to `$PWD/install_android`, and the third-party libraries it depends on are installed to `$PWD/install_android/third_party`.
@@ -82,16 +104,16 @@ CMake supports cross-compiling; see [cmake-toolchains](https://cmake.org/cm
Optional configuration arguments for the Android platform:
- `ANDROID_STANDALONE_TOOLCHAIN`: the absolute path of the standalone toolchain, or its path relative to the build directory. PaddlePaddle's CMake system automatically derives the cross-compiler, sysroot, and Android API level to use from this value; otherwise, the user has to set these values manually when running cmake. No default value.
- `ANDROID_TOOLCHAIN`: the target toolchain. Can be set to `gcc`/`clang`; the default is `clang`.
  - With CMake 3.7 and above, the `clang` toolchain is always used; below CMake 3.7, `ANDROID_TOOLCHAIN=gcc` can be set to use the `gcc` toolchain.
  - The official Android `clang` compiler requires a system with `GLIBC 2.15` or above.
- `ANDROID_ABI`: the target ABI. Currently `armeabi-v7a` and `arm64-v8a` are supported; the default is `armeabi-v7a`.
- `ANDROID_NATIVE_API_LEVEL`: the Android API level of the toolchain. If not set explicitly, PaddlePaddle derives it automatically from the value of `ANDROID_STANDALONE_TOOLCHAIN`.
- `ANDROID_ARM_MODE`: whether to use ARM mode.
  - When `ANDROID_ABI=armeabi-v7a`, it can be set to `ON`/`OFF`; the default is `ON`.
  - When `ANDROID_ABI=arm64-v8a`, it does not need to be set.
- `ANDROID_ARM_NEON`: whether to use NEON instructions.
  - When `ANDROID_ABI=armeabi-v7a`, it can be set to `ON`/`OFF`; the default is `ON`.
  - When `ANDROID_ABI=arm64-v8a`, it does not need to be set.
Other configuration arguments:
@@ -119,7 +141,7 @@ cmake -DCMAKE_SYSTEM_NAME=Android \
      -DANDROID_STANDALONE_TOOLCHAIN=your/path/to/arm64_standalone_toolchain \
      -DANDROID_ABI=arm64-v8a \
      -DUSE_EIGEN_FOR_BLAS=OFF \
      -DCMAKE_INSTALL_PREFIX=your/path/to/install \
      -DWITH_C_API=ON \
      -DWITH_SWIG_PY=OFF \
      ..
@@ -128,8 +150,8 @@ cmake -DCMAKE_SYSTEM_NAME=Android \
Users can also set other build arguments per their own needs. For example, to minimize the size of the generated library, set `CMAKE_BUILD_TYPE` to `MinSizeRel`; for the fastest execution speed, set `CMAKE_BUILD_TYPE` to `Release`. The PaddlePaddle build can also be influenced by manually setting `CMAKE_C/CXX_FLAGS_MINSIZEREL/RELEASE`.
**Performance TIPS.** To achieve the fastest computation speed, the following CMake configuration is recommended:
- set `CMAKE_BUILD_TYPE` to `Release`
- use the `clang` toolchain
- for `armeabi-v7a`, set `USE_EIGEN_FOR_BLAS=ON` to use Eigen for matrix computation; for `arm64-v8a`, set `USE_EIGEN_FOR_BLAS=OFF` to use OpenBLAS
### Build and Install
...
# Build PaddlePaddle for Android
There are two approaches to build PaddlePaddle for Android: using Docker and on Linux without Docker.
## Cross-Compiling Using Docker
Docker-based cross-compiling is the recommended approach because Docker runs on all major operating systems, including Linux, Mac OS X, and Windows.
### Build the Docker Image
The following steps pack all the tools that we need to build PaddlePaddle into a Docker image.
```bash
$ git clone https://github.com/PaddlePaddle/Paddle.git
$ cd Paddle
$ docker build -t paddle:dev-android . -f Dockerfile.android
```
### Build the Inference Library
We can run the Docker image we just created to build the inference library of PaddlePaddle for Android using the command below:
```bash
$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" paddle:dev-android
```
The Docker image accepts two arguments `ANDROID_ABI` and `ANDROID_API`:
<table class="docutils">
<colgroup>
<col width="25%" />
<col width="50%" />
<col width="25%" />
</colgroup>
<thead valign="bottom">
<tr class="row-odd">
<th class="head">Argument</th>
<th class="head">Optional Values</th>
<th class="head">Default</th>
</tr>
</thead>
<tbody valign="top">
<tr class="row-even">
<td>ANDROID_ABI</td>
<td>armeabi-v7a, arm64-v8a</td>
<td>armeabi-v7a</td>
</tr>
<tr class="row-odd">
<td>ANDROID_API</td>
<td>>= 21</td>
<td>21</td>
</tr>
</tbody>
</table>
The ARM-64 architecture (`arm64-v8a`) requires at least level 21 of Android API.
The default entry-point of the Docker image, [`paddle/scripts/docker/build_android.sh`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build_android.sh), generates the [Android cross-compiling standalone toolchain](https://developer.android.com/ndk/guides/standalone_toolchain.html) based on the arguments `ANDROID_ABI` and `ANDROID_API`. For information about other configuration arguments, please continue reading.
The above command generates and outputs the inference library in `$PWD/install_android` and puts third-party libraries in `$PWD/install_android/third_party`.
## Cross-Compiling on Linux
The Linux-based approach to cross-compiling is to run the steps in `Dockerfile.android` manually on a Linux x64 computer.
### Setup the Environment
To build for Android, we need the [Android NDK](
https://developer.android.com/ndk/downloads/index.html):
```bash
wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip
unzip -q android-ndk-r14b-linux-x86_64.zip
```
Android NDK includes everything we need to build the [*standalone toolchain*](https://developer.android.com/ndk/guides/standalone_toolchain.html), which is then used to build PaddlePaddle for Android. (We plan to remove the intermediate step of building the standalone toolchain in the near future.)
- To build the standalone toolchain for `armeabi-v7a` and Android API level 21:
```bash
your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \
--arch=arm --platform=android-21 --install-dir=your/path/to/arm_standalone_toolchain
```
The generated standalone toolchain will be in `your/path/to/arm_standalone_toolchain`.
- To build the standalone toolchain for `arm64-v8a` and Android API level 21:
```bash
your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \
--arch=arm64 --platform=android-21 --install-dir=your/path/to/arm64_standalone_toolchain
```
The generated standalone toolchain will be in `your/path/to/arm64_standalone_toolchain`.
**Please be aware that the minimum level of Android API required by PaddlePaddle is 21.**
### Cross-Compiling Arguments
CMake supports [choosing the toolchain](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling). PaddlePaddle provides [`android.cmake`](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/android.cmake), which configures the Android cross-compiling toolchain for CMake. `android.cmake` is not required for CMake >= 3.7, which supports Android cross-compiling natively. PaddlePaddle detects the CMake version; for versions newer than 3.7, it uses [the official version](https://cmake.org/cmake/help/v3.7/manual/cmake-toolchains.7.html#cross-compiling).
Some other CMake arguments you need to know:
- `CMAKE_SYSTEM_NAME` must be `Android`. This tells PaddlePaddle's CMake system to cross-compile third-party dependencies. This also changes some other CMake arguments like `WITH_GPU=OFF`, `WITH_AVX=OFF`, `WITH_PYTHON=OFF`, and `WITH_RDMA=OFF`.
- `WITH_C_API` must be `ON`, to build the C-based inference library for Android.
- `WITH_SWIG_PY` must be `OFF` because the Android platform doesn't support SWIG-based API.
Some Android-specific arguments:
- `ANDROID_STANDALONE_TOOLCHAIN`: the absolute path of the Android standalone toolchain, or the path relative to the CMake build directory. PaddlePaddle's CMake extensions would derive the cross-compiler, sysroot and Android API level from this argument.
- `ANDROID_TOOLCHAIN`: could be `gcc` or `clang`. The default value is `clang`.
- For CMake >= 3.7, it should anyway be `clang`. For older versions, it could be `gcc`.
- Android's official `clang` requires `glibc` >= 2.15.
- `ANDROID_ABI`: could be `armeabi-v7a` or `arm64-v8a`. The default value is `armeabi-v7a`.
- `ANDROID_NATIVE_API_LEVEL`: could be derived from the value of `ANDROID_STANDALONE_TOOLCHAIN`.
- `ANDROID_ARM_MODE`:
- could be `ON` or `OFF`, and defaults to `ON`, when `ANDROID_ABI=armeabi-v7a`;
- no need to specify when `ANDROID_ABI=arm64-v8a`.
- `ANDROID_ARM_NEON`: indicates whether to use NEON instructions.
- could be `ON` or `OFF`, and defaults to `ON`, when `ANDROID_ABI=armeabi-v7a`;
- no need to specify when `ANDROID_ABI=arm64-v8a`.
Other useful arguments:
- `USE_EIGEN_FOR_BLAS`: indicates whether to use Eigen for BLAS. Could be `ON` or `OFF`; defaults to `OFF`.
- `HOST_C/CXX_COMPILER`: specifies the host compiler, which is used to build the host-specific protoc and the target-specific OpenBLAS. It defaults to the value of the environment variable `CC`, or `cc`.
Some frequent configurations for your reference:
```bash
cmake -DCMAKE_SYSTEM_NAME=Android \
-DANDROID_STANDALONE_TOOLCHAIN=your/path/to/arm_standalone_toolchain \
-DANDROID_ABI=armeabi-v7a \
-DANDROID_ARM_NEON=ON \
-DANDROID_ARM_MODE=ON \
-DUSE_EIGEN_FOR_BLAS=ON \
-DCMAKE_INSTALL_PREFIX=your/path/to/install \
-DWITH_C_API=ON \
-DWITH_SWIG_PY=OFF \
..
```
```bash
cmake -DCMAKE_SYSTEM_NAME=Android \
-DANDROID_STANDALONE_TOOLCHAIN=your/path/to/arm64_standalone_toolchain \
-DANDROID_ABI=arm64-v8a \
-DUSE_EIGEN_FOR_BLAS=OFF \
-DCMAKE_INSTALL_PREFIX=your/path/to/install \
-DWITH_C_API=ON \
-DWITH_SWIG_PY=OFF \
..
```
There are some other arguments you might want to configure.
- `CMAKE_BUILD_TYPE=MinSizeRel` minimizes the size of the library.
- `CMAKE_BUILD_TYPE=Release` optimizes the runtime performance.
Our tips for performance optimization are to use clang, and Eigen or OpenBLAS:
- `CMAKE_BUILD_TYPE=Release`
- `ANDROID_TOOLCHAIN=clang`
- `USE_EIGEN_FOR_BLAS=ON` for `armeabi-v7a`, or `USE_EIGEN_FOR_BLAS=OFF` for `arm64-v8a`.
### Build and Install
After running `cmake`, run `make; make install` to build and install.
Before building, you might want to remove the `third_party` and `build` directories, which may contain libraries pre-built for other architectures.
After building, in the directory `CMAKE_INSTALL_PREFIX`, you will find three sub-directories:
- `include`: the header file of the inference library,
- `lib`: the inference library built for various Android ABIs,
- `third_party`: dependent third-party libraries built for Android.
@@ -27,10 +27,28 @@ Optional configuration arguments for the iOS platform:
  - `SIMULATOR`: build for the simulator platform with the `x86` architecture.
- `IOS_ARCH`: the target architecture. The target architectures that can be set for each `IOS_PLATFORM` are listed in the table below:
<table class="docutils">
<colgroup>
<col width="35%" />
<col width="65%" />
</colgroup>
<thead valign="bottom">
<tr class="row-odd">
<th class="head">IOS_PLATFORM</th>
<th class="head">IOS_ARCH</th>
</tr>
</thead>
<tbody valign="top">
<tr class="row-even">
<td>OS</td>
<td>armv7, armv7s, arm64 (default)</td>
</tr>
<tr class="row-odd">
<td>SIMULATOR</td>
<td>i386, x86_64 (default)</td>
</tr>
</tbody>
</table>
- `IOS_DEPLOYMENT_TARGET`: the minimum iOS deployment version; the default is `7.0`.
- `IOS_ENABLE_BITCODE`: whether to enable [Bitcode](https://developer.apple.com/library/content/documentation/IDEs/Conceptual/AppDistributionGuide/AppThinning/AppThinning.html#//apple_ref/doc/uid/TP40012582-CH35-SW3). Can be set to `ON`/`OFF`; the default is `ON`.
...
MOBILE
======
.. toctree::
:maxdepth: 1
cross_compiling_for_android_cn.md
cross_compiling_for_ios_cn.md
cross_compiling_for_raspberry_cn.md
MOBILE
======
.. toctree::
:maxdepth: 1
cross_compiling_for_android_en.md
cross_compiling_for_raspberry_en.md
@@ -29,32 +29,32 @@ add_style_check_target(paddle_capi ${CAPI_SOURCES} ${CAPI_HEADER}
add_dependencies(paddle_capi paddle_proto)

# TODO: paddle_capi_whole will be removed.
set(PADDLE_CAPI_LAYERS_LIBS
    paddle_function
    paddle_gserver)
if(MOBILE_INFERENCE)
  set(PADDLE_CAPI_ENGINE_LIBS
      paddle_utils
      paddle_parameter
      paddle_math
      paddle_cuda
      paddle_proto)
else()
  set(PADDLE_CAPI_ENGINE_LIBS
      paddle_utils
      paddle_parameter
      paddle_math
      paddle_cuda
      paddle_proto
      paddle_pserver
      paddle_network)
endif()
set(PADDLE_CAPI_INFER_LIBS ${PADDLE_CAPI_LAYERS_LIBS} ${PADDLE_CAPI_ENGINE_LIBS})
cc_library(paddle_capi_whole DEPS paddle_capi ${PADDLE_CAPI_INFER_LIBS})

# Link the static library for inference
cc_library(paddle_capi_engine DEPS paddle_capi ${PADDLE_CAPI_ENGINE_LIBS})
cc_library(paddle_capi_layers DEPS ${PADDLE_CAPI_LAYERS_LIBS})

# Link the shared library for inference
if(NOT IOS)
...
@@ -300,4 +300,12 @@ extern void hl_matrix_col2Vol(real* dataDst,
                              real alpha,
                              real beta);

/**
 * @brief Cast a real vector to an int vector, element by element.
 *
 * @param[out] out   output int vector.
 * @param[in]  vec   input real vector.
 * @param[in]  size  number of elements in the vector.
 */
extern void hl_vector_cast2int(int* out, real* vec, int size);

#endif /* HL_MATRIX_H_ */
@@ -133,4 +133,6 @@ inline void hl_matrix_col2Vol(real* dataDst,
                              real alpha,
                              real beta) {}

inline void hl_vector_cast2int(int* out, real* vec, int size) {}

#endif  // HL_MATRIX_STUB_H_
@@ -793,3 +793,14 @@ void hl_matrix_col2Vol(real* dataDst,
  CHECK_SYNC("hl_matrix_col2Vol failed");
}
// Cast each element of a real vector to int. The kernel is launched with a
// single thread block, which strides over the whole vector.
__global__ void keVectorCast2Int(int* out, real* vec, int size) {
  for (int i = threadIdx.x; i < (size); i += blockDim.x) {
    out[i] = int(vec[i]);
  }
}

void hl_vector_cast2int(int* out, real* vec, int size) {
  keVectorCast2Int<<<1, 512, 0, STREAM_DEFAULT>>>(out, vec, size);
  CHECK_SYNC("hl_vector_cast2int failed");
}
@@ -20,7 +20,8 @@ cc_test(scope_test SRCS scope_test.cc DEPS scope)
cc_library(attribute SRCS attribute.cc DEPS framework_proto)
cc_test(program_desc_test SRCS program_desc_test.cc DEPS proto_desc
    device_context)
cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute)
cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
@@ -44,8 +45,9 @@ add_custom_command(TARGET framework_py_proto POST_BUILD
cc_library(backward SRCS backward.cc DEPS net_op)
cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context fill_constant_op)
cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto backward glog lod_rank_table)
cc_library(prune SRCS prune.cc DEPS framework_proto)
cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
...
@@ -117,7 +117,7 @@ int64_t DDim::operator[](int idx) const {
  return boost::apply_visitor(DynamicConstIndexer(idx), var);
}

int DDim::size() const { return arity(*this); }

bool DDim::operator==(DDim d) const {
  if (var.which() != d.getVar().which()) {
...
@@ -71,7 +71,7 @@ struct DDim {
  DDim operator*(DDim d) const;

  int size() const;
};

/**
...
@@ -21,7 +21,9 @@ limitations under the License. */
#include <vector>

#include "paddle/framework/feed_fetch_type.h"
#include "paddle/framework/lod_rank_table.h"
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/lod_tensor_array.h"
#include "paddle/framework/op_registry.h"
#include "paddle/framework/scope.h"
@@ -70,10 +72,14 @@ static void CreateTensor(Variable* var, VarDesc::VarType var_type) {
    var->GetMutable<FeedFetchList>();
  } else if (var_type == VarDesc::STEP_SCOPES) {
    var->GetMutable<std::vector<framework::Scope>>();
  } else if (var_type == VarDesc::LOD_RANK_TABLE) {
    var->GetMutable<LoDRankTable>();
  } else if (var_type == VarDesc::LOD_TENSOR_ARRAY) {
    var->GetMutable<LoDTensorArray>();
  } else {
    PADDLE_THROW(
        "Variable type %d is not in "
        "[LoDTensor, SelectedRows, FEED_MINIBATCH, FETCH_LIST, LOD_RANK_TABLE]",
        var_type);
  }
}
@@ -83,7 +89,7 @@ void Executor::Run(const ProgramDescBind& pdesc, Scope* scope, int block_id,
  // TODO(tonyyang-svail):
  // - only runs on the first device (i.e. no interdevice communication)
  // - will change to use multiple blocks for RNN op and Cond Op
  PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), pdesc.Size());
  auto& block = pdesc.Block(block_id);
  auto& device = device_contexts_[0];
...
@@ -109,6 +109,11 @@ message LoDTensorDesc {
  optional int32 lod_level = 2 [ default = 0 ];
}

message LoDTensorArrayDesc {
  required TensorDesc tensor = 1;
  optional int32 lod_level = 2 [ default = 0 ];
}

message VarDesc {
  enum VarType {
    LOD_TENSOR = 1;
@@ -116,11 +121,14 @@ message VarDesc {
    FEED_MINIBATCH = 3;
    FETCH_LIST = 4;
    STEP_SCOPES = 5;
    LOD_RANK_TABLE = 6;
    LOD_TENSOR_ARRAY = 7;
  }
  required string name = 1;
  required VarType type = 2;
  optional LoDTensorDesc lod_tensor = 3;
  optional TensorDesc selected_rows = 4;
  optional LoDTensorArrayDesc tensor_array = 6;
  optional bool persistable = 5 [ default = false ];
}
...
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/lod_rank_table.h"
namespace paddle {
namespace framework {

void LoDRankTable::Reset(const LoD& lod, size_t level) {
  this->coarse_lod_.clear();
  this->items_.clear();
  PADDLE_ENFORCE(level < lod.size(),
                 "Cannot rank lod since level %d is not less than the lod "
                 "size %d",
                 level, lod.size());
  coarse_lod_.reserve(level);
  for (size_t i = 0; i < level; ++i) {
    coarse_lod_.push_back(lod[i]);
  }
  auto& vec = lod[level];
  for (size_t i = 0; i < vec.size() - 1; ++i) {
    TableItem item;
    item.index = i;
    item.length = vec[i + 1] - vec[i];
    VLOG(10) << "Add item to rank table " << item.index << " " << item.length;
    items_.emplace_back(item);
  }
  // NOTE(yuyang18):
  //
  // The time complexity of stable_sort is O(N*log(N)) if additional memory is
  // available. It is easy to debug and unit test when using `stable_sort`
  // instead of `sort`. Also, the items of a rank table will not be too large.
  std::stable_sort(items_.begin(), items_.end(),
                   [](const TableItem& a, const TableItem& b) {
                     return a.length > b.length;
                   });
}

}  // namespace framework
}  // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/lod_tensor.h"
namespace paddle {
namespace framework {
// LoD Rank Table stores the `level` of `lod` which is ordered by sequence
// length in descending order. It is useful when implementing dynamic RNN and
// is shared by the dynamic RNN memory, slice-input, and slice-output
// operators.
//
// Each table item contains two elements: the length of a sequence and the
// index of that sequence at the given level.
//
// LoDRankTable also stores coarse_lod, the LoD information above the input
// level, so that the output LoD information can be restored.
class LoDRankTable {
public:
struct TableItem {
size_t index;
size_t length;
};
LoDRankTable() {}
void Reset(const LoD& lod, size_t level);
const std::vector<TableItem>& items() const { return this->items_; }
const LoD& coarse_lod() const { return this->coarse_lod_; }
size_t level() const { return coarse_lod_.size(); }
private:
LoD coarse_lod_;
std::vector<TableItem> items_;
};
} // namespace framework
} // namespace paddle
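A minimal usage sketch of the class above (the numbers are ours): with level-0 offsets {0, 3, 4, 9} the sequence lengths are {3, 1, 5}, so `items()` comes back ordered (index 2, length 5), (index 0, length 3), (index 1, length 1):

```
#include <iostream>
#include "paddle/framework/lod_rank_table.h"

void RankTableExample() {
  paddle::framework::LoD lod;
  lod.push_back(std::vector<size_t>({0, 3, 4, 9}));

  paddle::framework::LoDRankTable table;
  table.Reset(lod, /*level=*/0);  // sorts items by length, descending

  for (auto& item : table.items()) {
    std::cout << item.index << " -> " << item.length << "\n";
  }
}
```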
...@@ -27,6 +27,20 @@ ...@@ -27,6 +27,20 @@
namespace paddle { namespace paddle {
namespace framework { namespace framework {
std::ostream& operator<<(std::ostream& os, const LoD& lod) {
os << "{";
for (auto& v : lod) {
os << "{";
for (auto& i : v) {
os << i << ",";
}
os << "}";
}
os << "}";
return os;
}
LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end) { LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end) {
LoD new_lod; LoD new_lod;
new_lod.reserve(level_end - level_begin); new_lod.reserve(level_end - level_begin);
...@@ -135,5 +149,41 @@ void LoDTensor::ShrinkInLevel(size_t level, size_t elem_begin, ...@@ -135,5 +149,41 @@ void LoDTensor::ShrinkInLevel(size_t level, size_t elem_begin,
PADDLE_ENFORCE_LT(begin, end, "Cannot shrink, the result tensor is empty."); PADDLE_ENFORCE_LT(begin, end, "Cannot shrink, the result tensor is empty.");
ShareDataWith(Slice(begin, end)); ShareDataWith(Slice(begin, end));
} }
using LoDAndOffset = std::pair<LoD, std::pair<size_t, size_t>>;
LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD& lod, size_t start_idx,
size_t end_idx, size_t start_level) {
LoD sub_lod;
for (size_t level_idx = start_level; level_idx < lod.size(); ++level_idx) {
PADDLE_ENFORCE_LE(start_idx, end_idx);
PADDLE_ENFORCE_LT(end_idx, lod[level_idx].size());
std::vector<size_t> level_lens;
for (size_t i = start_idx; i < end_idx; ++i) {
level_lens.push_back(lod[level_idx][i + 1] - lod[level_idx][i]);
}
sub_lod.emplace_back(level_lens);
start_idx = lod[level_idx][start_idx];
end_idx = lod[level_idx][end_idx];
}
return LoDAndOffset{sub_lod, {start_idx, end_idx}};
}
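To make the level descent concrete with the LoD from `TEST(LoD, GetFineGrainedLoDLength)` below: slicing items [1, 2) at level 0 first records the level-0 length lod[0][2] - lod[0][1] = 4 - 2 = 2, then maps the index range down one level through the offsets (start becomes lod[0][1] = 2, end becomes lod[0][2] = 4), repeats the same step at each level, and finally returns the bottom-level absolute offsets {15, 26} together with the per-level lengths {2}, {2, 2}, {2, 3, 4, 2}.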
void AppendLoD(LoD* lod, const LoD& lod_length) {
PADDLE_ENFORCE(
lod->empty() || lod->size() == lod_length.size(),
"The lod_length should has the same size with the appended lod.");
if (lod->empty()) {
*lod = LoD(lod_length.size(), std::vector<size_t>({0}));
}
for (size_t i = 0; i < lod->size(); ++i) {
auto& level = (*lod)[i];
for (size_t len : lod_length[i]) {
level.push_back(level.back() + len);
}
}
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -56,6 +56,8 @@ using Vector = thrust::host_vector< ...@@ -56,6 +56,8 @@ using Vector = thrust::host_vector<
*/ */
using LoD = std::vector<Vector<size_t>>; using LoD = std::vector<Vector<size_t>>;
std::ostream& operator<<(std::ostream& os, const LoD& lod);
/* /*
* Slice levels from a LoD. * Slice levels from a LoD.
* NOTE the lowest level should always be the absolute offsets of the underlying * NOTE the lowest level should always be the absolute offsets of the underlying
...@@ -181,5 +183,10 @@ LoDTensor LodExpand(const LoDTensor& source, const LoD& lod, size_t level, ...@@ -181,5 +183,10 @@ LoDTensor LodExpand(const LoDTensor& source, const LoD& lod, size_t level,
return tensor; return tensor;
} }
std::pair<LoD, std::pair<size_t, size_t>> GetSubLoDAndAbsoluteOffset(
const LoD& lod, size_t start_idx, size_t end_idx, size_t start_level);
void AppendLoD(LoD* lod, const LoD& lod_length);
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -140,19 +140,9 @@ Similarly, the lengths in the top level LoD ...@@ -140,19 +140,9 @@ Similarly, the lengths in the top level LoD
are transformed into offsets of elements/words as follows: are transformed into offsets of elements/words as follows:
``` ```
0 9 10 15 0 3 4 6
= = = = = =
3+2+4 1+9 2+3+10 3 3+1 4+2
```
so we can tell that the first article is from word 0 to word 9, and the second article is from word 9 to word 10.
The complete offset representation is as follows:
```
0 9 10 15
0 3 5 9 10 12 15
||| || |||| | || |||
``` ```
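To make the lengths-to-offsets transformation above concrete, here is a minimal standalone sketch; the function name is ours, not part of the framework:

```
#include <cstddef>
#include <vector>

// Convert per-sequence lengths, e.g. {3, 1, 2}, into the offset
// representation {0, 3, 4, 6} by accumulating a running sum.
std::vector<size_t> LengthsToOffsets(const std::vector<size_t>& lengths) {
  std::vector<size_t> offsets(1, 0);
  for (size_t len : lengths) {
    offsets.push_back(offsets.back() + len);
  }
  return offsets;
}
```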
## Slicing of LoD Tensors ## Slicing of LoD Tensors
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/framework/lod_tensor.h"
namespace paddle {
namespace framework {
using LoDTensorArray = std::vector<LoDTensor>;
}
} // namespace paddle
...@@ -144,5 +144,48 @@ TEST(LodExpand, test) { ...@@ -144,5 +144,48 @@ TEST(LodExpand, test) {
} }
} }
TEST(LoD, GetFineGrainedLoDLength) {
LoD lod;
lod.push_back(std::vector<size_t>({0, 2, 4, 5}));
lod.push_back(std::vector<size_t>({0, 1, 6, 8, 10, 11}));
lod.push_back(
std::vector<size_t>({0, 2, 5, 7, 10, 12, 15, 17, 20, 24, 26, 29}));
auto lod_and_offset =
paddle::framework::GetSubLoDAndAbsoluteOffset(lod, 1, 2, 0);
LoD lod_length = lod_and_offset.first;
size_t start_offset = lod_and_offset.second.first;
size_t end_offset = lod_and_offset.second.second;
LoD expected;
expected.push_back(std::vector<size_t>{2});
expected.push_back(std::vector<size_t>{2, 2});
expected.push_back(std::vector<size_t>{2, 3, 4, 2});
EXPECT_EQ(lod_length, expected);
EXPECT_EQ(start_offset, 15UL);
EXPECT_EQ(end_offset, 26UL);
}
TEST(LoD, AppendLoD) {
LoD lod_lens;
lod_lens.push_back(std::vector<size_t>({2}));
lod_lens.push_back(std::vector<size_t>({2, 2}));
lod_lens.push_back(std::vector<size_t>({2, 3, 4, 2}));
LoD origin;
origin.push_back(std::vector<size_t>({0, 2}));
origin.push_back(std::vector<size_t>({0, 1, 6}));
origin.push_back(std::vector<size_t>({0, 2, 5, 7, 10, 12, 15}));
paddle::framework::AppendLoD(&origin, lod_lens);
LoD expected;
expected.push_back(std::vector<size_t>({0, 2, 4}));
expected.push_back(std::vector<size_t>({0, 1, 6, 8, 10}));
expected.push_back(
std::vector<size_t>({0, 2, 5, 7, 10, 12, 15, 17, 20, 24, 26}));
EXPECT_EQ(origin, expected);
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -67,8 +67,11 @@ class CompileTimeInferShapeContext : public InferShapeContext { ...@@ -67,8 +67,11 @@ class CompileTimeInferShapeContext : public InferShapeContext {
out); out);
in_var->SetLoDLevel(out_var->GetLodLevel()); in_var->SetLoDLevel(out_var->GetLodLevel());
} }
bool IsRuntime() const override;
protected:
VarDesc::VarType GetVarType(const std::string &name) const override;
private:
DDim GetDim(const std::string &name) const override; DDim GetDim(const std::string &name) const override;
void SetDim(const std::string &name, const DDim &dim) override; void SetDim(const std::string &name, const DDim &dim) override;
...@@ -349,6 +352,9 @@ void OpDescBind::InferVarType(BlockDescBind *block) const { ...@@ -349,6 +352,9 @@ void OpDescBind::InferVarType(BlockDescBind *block) const {
info.infer_var_type_(*this, block); info.infer_var_type_(*this, block);
} else { } else {
// all output type is LoDTensor by default // all output type is LoDTensor by default
VLOG(10) << this->Type()
<< " has not registered InferVarType. Set output variables to "
"LOD_TENSOR";
for (auto &out_pair : this->outputs_) { for (auto &out_pair : this->outputs_) {
for (auto &out_var_name : out_pair.second) { for (auto &out_var_name : out_pair.second) {
block->Var(out_var_name)->SetType(VarDesc::LOD_TENSOR); block->Var(out_var_name)->SetType(VarDesc::LOD_TENSOR);
...@@ -448,6 +454,12 @@ void CompileTimeInferShapeContext::SetDim(const std::string &name, ...@@ -448,6 +454,12 @@ void CompileTimeInferShapeContext::SetDim(const std::string &name,
const DDim &dim) { const DDim &dim) {
block_.FindVarRecursive(name)->SetShape(framework::vectorize(dim)); block_.FindVarRecursive(name)->SetShape(framework::vectorize(dim));
} }
bool CompileTimeInferShapeContext::IsRuntime() const { return false; }
VarDesc::VarType CompileTimeInferShapeContext::GetVarType(
const std::string &name) const {
return block_.FindVarRecursive(name)->GetType();
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -92,8 +92,7 @@ struct OpKernelRegistrarFunctor<PlaceType, false, I, KernelTypes...> { ...@@ -92,8 +92,7 @@ struct OpKernelRegistrarFunctor<PlaceType, false, I, KernelTypes...> {
void operator()(const char* op_type) const { void operator()(const char* op_type) const {
using T = typename KERNEL_TYPE::ELEMENT_TYPE; using T = typename KERNEL_TYPE::ELEMENT_TYPE;
OperatorWithKernel::OpKernelKey key(ToDataType(std::type_index(typeid(T))), OpKernelType key(ToDataType(std::type_index(typeid(T))), PlaceType());
PlaceType());
OperatorWithKernel::AllOpKernels()[op_type][key].reset(new KERNEL_TYPE); OperatorWithKernel::AllOpKernels()[op_type][key].reset(new KERNEL_TYPE);
constexpr auto size = std::tuple_size<std::tuple<KernelTypes...>>::value; constexpr auto size = std::tuple_size<std::tuple<KernelTypes...>>::value;
......
...@@ -15,7 +15,9 @@ limitations under the License. */ ...@@ -15,7 +15,9 @@ limitations under the License. */
#include "paddle/framework/operator.h" #include "paddle/framework/operator.h"
#include <algorithm> #include <algorithm>
#include <atomic> #include <atomic>
#include "paddle/framework/lod_tensor_array.h"
#include "paddle/framework/shape_inference.h" #include "paddle/framework/shape_inference.h"
#include "paddle/framework/var_type.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -252,8 +254,7 @@ std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>( ...@@ -252,8 +254,7 @@ std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
return res; return res;
} }
std::ostream& operator<<(std::ostream& os, std::ostream& operator<<(std::ostream& os, const OpKernelType& kernel_key) {
const OperatorWithKernel::OpKernelKey& kernel_key) {
os << "place[" << kernel_key.place_ << "]:data_type[" << kernel_key.data_type_ os << "place[" << kernel_key.place_ << "]:data_type[" << kernel_key.data_type_
<< "]"; << "]";
return os; return os;
...@@ -365,7 +366,9 @@ class RuntimeInferShapeContext : public InferShapeContext { ...@@ -365,7 +366,9 @@ class RuntimeInferShapeContext : public InferShapeContext {
out_tensor->set_lod(in_tensor.lod()); out_tensor->set_lod(in_tensor.lod());
} }
private: bool IsRuntime() const override { return true; }
protected:
DDim GetDim(const std::string& name) const override { DDim GetDim(const std::string& name) const override {
Variable* var = scope_.FindVar(name); Variable* var = scope_.FindVar(name);
if (var->IsType<LoDTensor>()) { if (var->IsType<LoDTensor>()) {
...@@ -388,6 +391,12 @@ class RuntimeInferShapeContext : public InferShapeContext { ...@@ -388,6 +391,12 @@ class RuntimeInferShapeContext : public InferShapeContext {
} }
} }
VarDesc::VarType GetVarType(const std::string& name) const override {
auto* var = scope_.FindVar(name);
return ToVarType(var->Type());
}
private:
const OperatorBase& op_; const OperatorBase& op_;
const Scope& scope_; const Scope& scope_;
}; };
...@@ -422,7 +431,7 @@ void OperatorWithKernel::Run(const Scope& scope, ...@@ -422,7 +431,7 @@ void OperatorWithKernel::Run(const Scope& scope,
// check if op[type] have kernel for kernel_key // check if op[type] have kernel for kernel_key
OpKernelMap& kernels = kernels_iter->second; OpKernelMap& kernels = kernels_iter->second;
auto kernel_key = OpKernelKey(IndicateDataType(ctx), dev_ctx); auto kernel_key = GetKernelType(ctx);
auto kernel_iter = kernels.find(kernel_key); auto kernel_iter = kernels.find(kernel_key);
if (kernel_iter == kernels.end()) { if (kernel_iter == kernels.end()) {
...@@ -430,6 +439,41 @@ void OperatorWithKernel::Run(const Scope& scope, ...@@ -430,6 +439,41 @@ void OperatorWithKernel::Run(const Scope& scope,
} }
kernel_iter->second->Compute(ctx); kernel_iter->second->Compute(ctx);
// throws pending device errors, if any.
dev_ctx.Finish();
}
OpKernelType OperatorWithKernel::GetKernelType(
const ExecutionContext& ctx) const {
return OpKernelType(IndicateDataType(ctx), ctx.device_context());
}
DataType OperatorWithKernel::IndicateDataType(
const ExecutionContext& ctx) const {
auto& scope = ctx.scope();
int data_type = -1;
for (auto& input : this->inputs_) {
for (auto& ipt_name : input.second) {
auto* var = scope.FindVar(ipt_name);
if (var != nullptr) {
const Tensor* t = nullptr;
if (var->IsType<Tensor>()) {
t = &var->Get<Tensor>();
} else if (var->IsType<LoDTensor>()) {
t = &var->Get<LoDTensor>();
} else if (var->IsType<SelectedRows>()) {
t = &(var->Get<SelectedRows>().value());
}
if (t != nullptr) {
int tmp = static_cast<int>(ToDataType(t->type()));
PADDLE_ENFORCE(tmp == data_type || data_type == -1,
"DataType of Paddle Op %s must be the same.", Type());
data_type = tmp;
}
}
}
}
PADDLE_ENFORCE(data_type != -1, "DataType should be indicated by input");
return static_cast<DataType>(data_type);
} }
} // namespace framework } // namespace framework
......
...@@ -298,11 +298,10 @@ class ExecutionContext { ...@@ -298,11 +298,10 @@ class ExecutionContext {
} }
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
const platform::CUDADeviceContext& cuda_device_context() const { const inline platform::CUDADeviceContext& cuda_device_context() const {
PADDLE_ENFORCE(platform::is_gpu_place(device_context_.GetPlace())); PADDLE_ENFORCE(platform::is_gpu_place(device_context_.GetPlace()));
auto cuda_ctx = return *reinterpret_cast<const platform::CUDADeviceContext*>(
reinterpret_cast<const platform::CUDADeviceContext*>(&device_context_); &device_context_);
return *cuda_ctx;
} }
#endif #endif
...@@ -346,27 +345,10 @@ class OpKernel : public OpKernelBase { ...@@ -346,27 +345,10 @@ class OpKernel : public OpKernelBase {
using ELEMENT_TYPE = T; using ELEMENT_TYPE = T;
}; };
class OperatorWithKernel : public OperatorBase { struct OpKernelType {
public: struct Hash {
struct OpKernelKey {
platform::Place place_;
DataType data_type_;
OpKernelKey(DataType data_type, platform::Place place)
: place_(place), data_type_(data_type) {}
OpKernelKey(DataType data_type, const platform::DeviceContext& dev_ctx)
: place_(dev_ctx.GetPlace()), data_type_(data_type) {}
bool operator==(const OpKernelKey& o) const {
return platform::places_are_same_class(place_, o.place_) &&
data_type_ == o.data_type_;
}
};
struct OpKernelHash {
std::hash<int> hash_; std::hash<int> hash_;
size_t operator()(const OpKernelKey& key) const { size_t operator()(const OpKernelType& key) const {
int place = key.place_.which(); int place = key.place_.which();
int data_type = static_cast<int>(key.data_type_); int data_type = static_cast<int>(key.data_type_);
int pre_hash = data_type << NUM_PLACE_TYPE_LIMIT_IN_BIT | int pre_hash = data_type << NUM_PLACE_TYPE_LIMIT_IN_BIT |
...@@ -375,9 +357,26 @@ class OperatorWithKernel : public OperatorBase { ...@@ -375,9 +357,26 @@ class OperatorWithKernel : public OperatorBase {
} }
}; };
platform::Place place_;
DataType data_type_;
OpKernelType(DataType data_type, platform::Place place)
: place_(place), data_type_(data_type) {}
OpKernelType(DataType data_type, const platform::DeviceContext& dev_ctx)
: place_(dev_ctx.GetPlace()), data_type_(data_type) {}
bool operator==(const OpKernelType& o) const {
return platform::places_are_same_class(place_, o.place_) &&
data_type_ == o.data_type_;
}
};
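The hash above packs the place index into the low bits and the data type into the bits above them before hashing the combined int; a standalone illustration of the same packing, with a constant of ours standing in for `NUM_PLACE_TYPE_LIMIT_IN_BIT`:

```
#include <cstddef>
#include <functional>

constexpr int kPlaceBits = 4;  // stand-in for NUM_PLACE_TYPE_LIMIT_IN_BIT

size_t HashKernelKey(int data_type, int place) {
  // data_type and place occupy disjoint bit ranges of pre_hash.
  int pre_hash = (data_type << kPlaceBits) | place;
  return std::hash<int>()(pre_hash);
}
```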
class OperatorWithKernel : public OperatorBase {
public:
using OpKernelMap = using OpKernelMap =
std::unordered_map<OpKernelKey, std::unique_ptr<OpKernelBase>, std::unordered_map<OpKernelType, std::unique_ptr<OpKernelBase>,
OpKernelHash>; OpKernelType::Hash>;
OperatorWithKernel(const std::string& type, const VariableNameMap& inputs, OperatorWithKernel(const std::string& type, const VariableNameMap& inputs,
const VariableNameMap& outputs, const AttributeMap& attrs) const VariableNameMap& outputs, const AttributeMap& attrs)
...@@ -405,42 +404,15 @@ class OperatorWithKernel : public OperatorBase { ...@@ -405,42 +404,15 @@ class OperatorWithKernel : public OperatorBase {
} }
protected: protected:
virtual OpKernelType GetKernelType(const ExecutionContext& ctx) const;
private:
// indicate kernel DataType by input data. By default all input data must be // indicate kernel DataType by input data. By default all input data must be
// the same. // the same.
virtual DataType IndicateDataType(const ExecutionContext& ctx) const { DataType IndicateDataType(const ExecutionContext& ctx) const;
VLOG(3) << "Default IndicateDataType " << this->Type();
auto& scope = ctx.scope();
int data_type = -1;
for (auto& input : this->inputs_) {
for (auto& ipt_name : input.second) {
auto* var = scope.FindVar(ipt_name);
if (var != nullptr) {
const Tensor* t = nullptr;
if (var->IsType<Tensor>()) {
t = &var->Get<Tensor>();
} else if (var->IsType<LoDTensor>()) {
t = &var->Get<LoDTensor>();
} else if (var->IsType<SelectedRows>()) {
t = &(var->Get<SelectedRows>().value());
}
if (t != nullptr) {
int tmp = static_cast<int>(ToDataType(t->type()));
VLOG(3) << "Input " << ipt_name << " with data_type " << tmp;
PADDLE_ENFORCE(tmp == data_type || data_type == -1,
"DataType of Paddle Op %s must be the same.",
Type());
data_type = tmp;
}
}
}
}
PADDLE_ENFORCE(data_type != -1, "DataType should be indicated by input");
return static_cast<DataType>(data_type);
}
}; };
std::ostream& operator<<(std::ostream& os, std::ostream& operator<<(std::ostream& os, const OpKernelType& kernel_key);
const OperatorWithKernel::OpKernelKey& kernel_key);
extern bool OpSupportGPU(const std::string& op_type); extern bool OpSupportGPU(const std::string& op_type);
......
...@@ -114,8 +114,8 @@ class OpWithKernelTest : public OperatorWithKernel { ...@@ -114,8 +114,8 @@ class OpWithKernelTest : public OperatorWithKernel {
protected: protected:
void InferShape(framework::InferShapeContext* ctx) const override {} void InferShape(framework::InferShapeContext* ctx) const override {}
DataType IndicateDataType(const ExecutionContext& ctx) const override { OpKernelType GetKernelType(const ExecutionContext& ctx) const override {
return DataType::FP32; return OpKernelType(DataType::FP32, ctx.device_context());
} }
}; };
......
...@@ -46,6 +46,23 @@ void InferShapeContext::SetDims(const std::vector<std::string> &names, ...@@ -46,6 +46,23 @@ void InferShapeContext::SetDims(const std::vector<std::string> &names,
SetDim(names[i], dims[i]); SetDim(names[i], dims[i]);
} }
} }
std::vector<VarDesc::VarType> InferShapeContext::GetInputsVarType(
const std::string &name) const {
return GetVarTypes(Inputs(name));
}
std::vector<VarDesc::VarType> InferShapeContext::GetOutputsVarType(
const std::string &name) const {
return GetVarTypes(Outputs(name));
}
std::vector<VarDesc::VarType> InferShapeContext::GetVarTypes(
const std::vector<std::string> &names) const {
std::vector<VarDesc::VarType> retv;
retv.resize(names.size());
std::transform(names.begin(), names.end(), retv.begin(),
std::bind(std::mem_fn(&InferShapeContext::GetVarType), this,
std::placeholders::_1));
return retv;
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/framework/attribute.h" #include "paddle/framework/attribute.h"
#include "paddle/framework/ddim.h" #include "paddle/framework/ddim.h"
#include "paddle/framework/framework.pb.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -26,6 +27,10 @@ class InferShapeContext { ...@@ -26,6 +27,10 @@ class InferShapeContext {
virtual bool HasInput(const std::string &name) const = 0; virtual bool HasInput(const std::string &name) const = 0;
virtual bool HasOutput(const std::string &name) const = 0; virtual bool HasOutput(const std::string &name) const = 0;
std::vector<VarDesc::VarType> GetInputsVarType(const std::string &name) const;
std::vector<VarDesc::VarType> GetOutputsVarType(
const std::string &name) const;
virtual bool HasInputs(const std::string &name) const = 0; virtual bool HasInputs(const std::string &name) const = 0;
virtual bool HasOutputs(const std::string &name) const = 0; virtual bool HasOutputs(const std::string &name) const = 0;
...@@ -46,6 +51,8 @@ class InferShapeContext { ...@@ -46,6 +51,8 @@ class InferShapeContext {
virtual void ShareLoD(const std::string &in, const std::string &out, virtual void ShareLoD(const std::string &in, const std::string &out,
size_t i = 0, size_t j = 0) const = 0; size_t i = 0, size_t j = 0) const = 0;
virtual bool IsRuntime() const = 0;
protected: protected:
virtual framework::DDim GetDim(const std::string &name) const = 0; virtual framework::DDim GetDim(const std::string &name) const = 0;
virtual void SetDim(const std::string &name, const framework::DDim &dim) = 0; virtual void SetDim(const std::string &name, const framework::DDim &dim) = 0;
...@@ -55,6 +62,11 @@ class InferShapeContext { ...@@ -55,6 +62,11 @@ class InferShapeContext {
void SetDims(const std::vector<std::string> &names, void SetDims(const std::vector<std::string> &names,
const std::vector<framework::DDim> &dims); const std::vector<framework::DDim> &dims);
std::vector<VarDesc::VarType> GetVarTypes(
const std::vector<std::string> &names) const;
virtual VarDesc::VarType GetVarType(const std::string &name) const = 0;
}; };
} // namespace framework } // namespace framework
......
...@@ -52,7 +52,7 @@ struct SizeOfTypeFunctor<HEAD, TAIL...> { ...@@ -52,7 +52,7 @@ struct SizeOfTypeFunctor<HEAD, TAIL...> {
}; };
static inline size_t SizeOfType(std::type_index type) { static inline size_t SizeOfType(std::type_index type) {
SizeOfTypeFunctor<int, float, double, int16_t, int64_t> functor; SizeOfTypeFunctor<int, float, double, int16_t, int64_t, bool> functor;
size_t size = functor(type); size_t size = functor(type);
PADDLE_ENFORCE(size != 0UL, "Cannot get size of type %s", type.name()); PADDLE_ENFORCE(size != 0UL, "Cannot get size of type %s", type.name());
return size; return size;
......
...@@ -37,13 +37,29 @@ std::vector<int64_t> VarDescBind::Shape() const { ...@@ -37,13 +37,29 @@ std::vector<int64_t> VarDescBind::Shape() const {
DataType VarDescBind::GetDataType() const { return tensor_desc().data_type(); } DataType VarDescBind::GetDataType() const { return tensor_desc().data_type(); }
void VarDescBind::SetLoDLevel(int32_t lod_level) { void VarDescBind::SetLoDLevel(int32_t lod_level) {
PADDLE_ENFORCE(desc_.type() == VarDesc::LOD_TENSOR); switch (desc_.type()) {
desc_.mutable_lod_tensor()->set_lod_level(lod_level); case VarDesc::LOD_TENSOR:
desc_.mutable_lod_tensor()->set_lod_level(lod_level);
break;
case VarDesc::LOD_TENSOR_ARRAY:
desc_.mutable_tensor_array()->set_lod_level(lod_level);
break;
default:
PADDLE_THROW("Tensor type=%d does not support LoDLevel",
desc_.tensor_array().lod_level());
}
} }
int32_t VarDescBind::GetLodLevel() const { int32_t VarDescBind::GetLodLevel() const {
PADDLE_ENFORCE(desc_.type() == VarDesc::LOD_TENSOR); switch (desc_.type()) {
return desc_.lod_tensor().lod_level(); case VarDesc::LOD_TENSOR:
return desc_.lod_tensor().lod_level();
case VarDesc::LOD_TENSOR_ARRAY:
return desc_.tensor_array().lod_level();
default:
PADDLE_THROW("Tensor type=%d does not support LoDLevel",
desc_.tensor_array().lod_level());
}
} }
const TensorDesc &VarDescBind::tensor_desc() const { const TensorDesc &VarDescBind::tensor_desc() const {
...@@ -53,6 +69,8 @@ const TensorDesc &VarDescBind::tensor_desc() const { ...@@ -53,6 +69,8 @@ const TensorDesc &VarDescBind::tensor_desc() const {
return desc_.selected_rows(); return desc_.selected_rows();
case VarDesc::LOD_TENSOR: case VarDesc::LOD_TENSOR:
return desc_.lod_tensor().tensor(); return desc_.lod_tensor().tensor();
case VarDesc::LOD_TENSOR_ARRAY:
return desc_.tensor_array().tensor();
default: default:
PADDLE_THROW("Unexpected branch."); PADDLE_THROW("Unexpected branch.");
} }
...@@ -66,6 +84,8 @@ TensorDesc *VarDescBind::mutable_tensor_desc() { ...@@ -66,6 +84,8 @@ TensorDesc *VarDescBind::mutable_tensor_desc() {
return desc_.mutable_selected_rows(); return desc_.mutable_selected_rows();
case VarDesc::LOD_TENSOR: case VarDesc::LOD_TENSOR:
return desc_.mutable_lod_tensor()->mutable_tensor(); return desc_.mutable_lod_tensor()->mutable_tensor();
case VarDesc::LOD_TENSOR_ARRAY:
return desc_.mutable_tensor_array()->mutable_tensor();
default: default:
PADDLE_THROW("Unexpected branch."); PADDLE_THROW("Unexpected branch.");
} }
......
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once #pragma once
#include <vector> #include <vector>
#include "glog/logging.h"
#include "paddle/framework/framework.pb.h" #include "paddle/framework/framework.pb.h"
namespace paddle { namespace paddle {
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/framework.pb.h"
#include "paddle/framework/lod_rank_table.h"
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/lod_tensor_array.h"
namespace paddle {
namespace framework {
inline VarDesc::VarType ToVarType(std::type_index type) {
if (type.hash_code() == typeid(LoDTensor).hash_code()) {
return VarDesc_VarType_LOD_TENSOR;
} else if (type.hash_code() == typeid(LoDRankTable).hash_code()) {
return VarDesc_VarType_LOD_RANK_TABLE;
} else if (type.hash_code() == typeid(LoDTensorArray).hash_code()) {
return VarDesc_VarType_LOD_TENSOR_ARRAY;
} else {
PADDLE_THROW("ToVarType:Unsupported type %s", type.name());
}
}
} // namespace framework
} // namespace paddle
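Paired with the `Variable::Type()` accessor added below, `ToVarType` lets runtime code recover the proto-level tag from a variable, which is what `RuntimeInferShapeContext::GetVarType` above does; a minimal sketch:

```
#include "paddle/framework/var_type.h"
#include "paddle/framework/variable.h"

paddle::framework::VarDesc::VarType TagOf(
    const paddle::framework::Variable& var) {
  // Throws if var holds no value; maps e.g. LoDTensor -> VarDesc::LOD_TENSOR.
  return paddle::framework::ToVarType(var.Type());
}
```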
...@@ -48,6 +48,11 @@ class Variable { ...@@ -48,6 +48,11 @@ class Variable {
void Clear() { holder_.reset(); } void Clear() { holder_.reset(); }
std::type_index Type() const {
PADDLE_ENFORCE(holder_ != nullptr, "Must hold memory");
return holder_->Type();
}
private: private:
struct Placeholder { struct Placeholder {
virtual ~Placeholder() {} virtual ~Placeholder() {}
......
...@@ -395,14 +395,24 @@ real AucEvaluator::evalImp(std::vector<Argument>& arguments) { ...@@ -395,14 +395,24 @@ real AucEvaluator::evalImp(std::vector<Argument>& arguments) {
CHECK_LE(arguments.size(), (size_t)3); CHECK_LE(arguments.size(), (size_t)3);
MatrixPtr output = arguments[0].value; MatrixPtr output = arguments[0].value;
IVectorPtr label = arguments[1].ids; IVectorPtr label = arguments[1].ids;
MatrixPtr labelval = arguments[1].value;
bool supportWeight = (3 == arguments.size()); bool supportWeight = (3 == arguments.size());
MatrixPtr weight = supportWeight ? arguments[2].value : nullptr; MatrixPtr weight = supportWeight ? arguments[2].value : nullptr;
if (nullptr == output || nullptr == label ||
(supportWeight && nullptr == weight)) { if (nullptr == output || (supportWeight && nullptr == weight)) {
return 0; return 0;
} }
size_t insNum = output->getHeight(); size_t insNum = output->getHeight();
size_t outputDim = output->getWidth(); size_t outputDim = output->getWidth();
// Copy label from value to a vector.
if (nullptr == label && nullptr != labelval) {
// label width is 1
CHECK_EQ(1U, labelval->getWidth());
VectorPtr vec =
Vector::create(labelval->getData(), insNum, output->useGpu());
label = vec->castToInt();
}
CHECK_EQ(insNum, label->getSize()); CHECK_EQ(insNum, label->getSize());
if (supportWeight) { if (supportWeight) {
CHECK_EQ(insNum, weight->getHeight()); CHECK_EQ(insNum, weight->getHeight());
...@@ -443,6 +453,7 @@ real AucEvaluator::evalImp(std::vector<Argument>& arguments) { ...@@ -443,6 +453,7 @@ real AucEvaluator::evalImp(std::vector<Argument>& arguments) {
int* labelD = label->getData(); int* labelD = label->getData();
real* weightD = supportWeight ? weight->getData() : nullptr; real* weightD = supportWeight ? weight->getData() : nullptr;
size_t pos = realColumnIdx_; size_t pos = realColumnIdx_;
for (size_t i = 0; i < insNum; ++i) { for (size_t i = 0; i < insNum; ++i) {
real value = outputD[pos]; real value = outputD[pos];
uint32_t binIdx = static_cast<uint32_t>(value * kBinNum_); uint32_t binIdx = static_cast<uint32_t>(value * kBinNum_);
......
...@@ -17,7 +17,7 @@ limitations under the License. */ ...@@ -17,7 +17,7 @@ limitations under the License. */
namespace paddle { namespace paddle {
ThreadLocalD<std::vector<MemoryHandle *>> ConvBaseProjection::convMem_; ThreadLocalD<std::vector<MemoryHandlePtr>> ConvBaseProjection::convMem_;
ConvBaseProjection::ConvBaseProjection(const ProjectionConfig &config, ConvBaseProjection::ConvBaseProjection(const ProjectionConfig &config,
ParameterPtr parameter, ParameterPtr parameter,
...@@ -175,18 +175,18 @@ void ConvBaseProjection::reshape(int batchSize) { ...@@ -175,18 +175,18 @@ void ConvBaseProjection::reshape(int batchSize) {
} }
void *ConvBaseProjection::getSpaceBytes(size_t size) { void *ConvBaseProjection::getSpaceBytes(size_t size) {
std::vector<MemoryHandle *> &convMem = *convMem_; std::vector<MemoryHandlePtr> &convMem = *convMem_;
if (convMem.empty()) { if (convMem.empty()) {
int numDevices = hl_get_device_count(); int numDevices = hl_get_device_count();
convMem.resize(numDevices); convMem.resize(numDevices);
} }
int devId = hl_get_device(); int devId = hl_get_device();
MemoryHandle **localMem = &(convMem[devId]); MemoryHandlePtr &localMem = convMem[devId];
if (NULL == *localMem || size > (*localMem)->getAllocSize()) { if (nullptr == localMem || size > localMem->getAllocSize()) {
*localMem = new GpuMemoryHandle(size); localMem = std::make_shared<GpuMemoryHandle>(size);
} }
return (*localMem)->getBuf(); return localMem->getBuf();
} }
ConvBaseProjection::~ConvBaseProjection() { ConvBaseProjection::~ConvBaseProjection() {
......
...@@ -105,7 +105,7 @@ protected: ...@@ -105,7 +105,7 @@ protected:
bool bias_; bool bias_;
std::unique_ptr<Weight> weight_; std::unique_ptr<Weight> weight_;
static ThreadLocalD<std::vector<MemoryHandle*>> convMem_; static ThreadLocalD<std::vector<MemoryHandlePtr>> convMem_;
}; };
} // namespace paddle } // namespace paddle
/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "MKLDNNAddtoLayer.h"
using namespace mkldnn; // NOLINT
namespace paddle {
REGISTER_LAYER(mkldnn_addto, MKLDNNAddtoLayer);
bool MKLDNNAddtoLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
if (!MKLDNNLayer::init(layerMap, parameterMap)) {
return false;
}
layerSize_ = getSize();
for (size_t i = 0; i < inputLayers_.size(); i++) {
CHECK_EQ(layerSize_, inputLayers_[i]->getSize()) << "input sizes must be equal";
}
if (biasParameter_.get() != NULL) {
biases_ =
std::unique_ptr<Weight>(new Weight(1, layerSize_, biasParameter_, 0));
}
return true;
}
void MKLDNNAddtoLayer::reshape(
int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) {
CHECK_EQ(layerSize_, getSize()) << "this layer size cannot be changed";
reshapeInput(bs, ih, iw);
ic = inputLayers_[0]->getSize() / ih / iw;
CHECK_EQ((size_t)ic * ih * iw, inputLayers_[0]->getSize());
CHECK_EQ(inputElemenCnt_, (size_t)bs * ic * ih * iw);
for (size_t i = 0; i < inputLayers_.size(); i++) {
CHECK_EQ(int64_t(bs), inputLayers_[i]->getOutput().getBatchSize());
CHECK_EQ(layerSize_, inputLayers_[i]->getSize());
}
oc = ic;
oh = ih;
ow = iw;
reshapeOutput(oh, ow);
resizeOutput(bs, oc * oh * ow);
printSizeInfo();
}
void MKLDNNAddtoLayer::resetFwd(std::vector<primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
if (biases_) {
LOG(FATAL) << "not implemented yet";
}
resetFwdBuffers(inVals_, out);
in = inVals_[0];
std::shared_ptr<sum::primitive_desc> fwdPD;
resetFwdPD(fwdPD, inVals_, out);
resetFwdPipeline(pipeline, fwdPD, inVals_, out);
}
void MKLDNNAddtoLayer::resetBwd(std::vector<primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
resetBwdBuffers(inGrads_, out);
in = inGrads_[0];
// backward only needs to share the output grad with the input grads
for (size_t i = 0; i < inGrads_.size(); i++) {
if (inGrads_[i] != nullptr) {
inGrads_[i] = out;
inputLayers_[i]->getOutputGrad()->setData(inGrads_[i]->getData());
}
}
}
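Since addto computes an element-wise sum of its inputs, the gradient with respect to every input equals the output gradient, which is why `resetBwd` can simply alias each input-grad buffer to the output grad instead of issuing copies.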
void MKLDNNAddtoLayer::updateWeights(const UpdateCallback& callback) {
if (biases_ && biases_->getWGrad()) {
biases_->getParameterPtr()->incUpdate(callback);
}
}
void MKLDNNAddtoLayer::resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) {
inputs.resize(inputLayers_.size());
for (size_t i = 0; i < inputs.size(); i++) {
resetInValue(inputs[i], nullptr, i);
CHECK(inputs[i]);
inputs[i]->downSpatial();
}
for (size_t i = 1; i < inputs.size(); i++) {
CHECK_PRIMITIVE_DESC_EQ(inputs[i], inputs[0]->getPrimitiveDesc());
}
resetOutValue(out, inputs[0]->getPrimitiveDesc());
}
void MKLDNNAddtoLayer::resetFwdPD(std::shared_ptr<sum::primitive_desc>& pd,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr out) {
std::vector<double> scales(inputs.size(), 1.0);
std::vector<memory::primitive_desc> srcPDs;
for (size_t i = 0; i < inputs.size(); i++) {
srcPDs.push_back(inputs[i]->getPrimitiveDesc());
}
CHECK(out);
pd.reset(new sum::primitive_desc(out->getMemoryDesc(), scales, srcPDs));
CHECK_PRIMITIVE_DESC_EQ(out, pd->dst_primitive_desc());
}
void MKLDNNAddtoLayer::resetFwdPipeline(
std::vector<primitive>& pipeline,
std::shared_ptr<sum::primitive_desc>& pd,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) {
std::vector<primitive::at> srcs;
for (size_t i = 0; i < inputs.size(); i++) {
srcs.push_back(*(inputs[i]));
}
fwd_.reset(new sum(*pd, srcs, *out));
pipeline.push_back(*fwd_);
}
void MKLDNNAddtoLayer::resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) {
CHECK(outVal_);
resetOutGrad(out, outVal_->getPrimitiveDesc());
CHECK(out);
inputs.resize(inputLayers_.size());
for (size_t i = 0; i < inputs.size(); i++) {
resetInGrad(inputs[i], inVal_->getPrimitiveDesc(), i);
CHECK_PRIMITIVE_DESC_EQ(inputs[i], out->getPrimitiveDesc());
}
}
} // namespace paddle
/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "MKLDNNLayer.h"
#include "mkldnn.hpp"
namespace paddle {
/**
* @brief A subclass of MKLDNNLayer implementing the Addto layer.
*
* The config file api is mkldnn_addto
*/
class MKLDNNAddtoLayer : public MKLDNNLayer {
protected:
std::vector<MKLDNNMatrixPtr> inVals_;
std::vector<MKLDNNMatrixPtr> inGrads_;
// layer size == ic * ih * iw == oc * oh * ow, and cannot be changed
size_t layerSize_;
// TODO(TJ): this part has not been optimized by MKL-DNN
std::unique_ptr<Weight> biases_;
public:
explicit MKLDNNAddtoLayer(const LayerConfig& config) : MKLDNNLayer(config) {}
~MKLDNNAddtoLayer() {}
bool init(const LayerMap& layerMap,
const ParameterMap& parameterMap) override;
void reshape(
int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) override;
void resetFwd(std::vector<mkldnn::primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) override;
void resetBwd(std::vector<mkldnn::primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) override;
void updateWeights(const UpdateCallback& callback) override;
void printValueFormat() override {
for (size_t i = 0; i < inVals_.size(); ++i) {
VLOG(MKLDNN_FMTS) << i << " input: " << inVals_[i]->getFormat() << " >>>";
}
if (outVal_) {
VLOG(MKLDNN_FMTS) << outVal_->getFormat() << " >>> ";
}
if (extOutVal_) {
VLOG(MKLDNN_FMTS) << extOutVal_->getFormat();
}
}
void printGradFormat() override {
if (extOutGrad_) {
VLOG(MKLDNN_FMTS) << extOutGrad_->getFormat();
}
if (outGrad_) {
VLOG(MKLDNN_FMTS) << outGrad_->getFormat() << " <<< ";
}
for (size_t i = 0; i < inGrads_.size(); ++i) {
VLOG(MKLDNN_FMTS) << i << " input: " << inGrads_[i]->getFormat() << "<<<";
}
}
protected:
/**
* Forward functions: reset buffers(inputs, output, bias),
* reset primitive descriptor,
* reset pipeline.
*/
void resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out);
void resetFwdPD(std::shared_ptr<mkldnn::sum::primitive_desc>& pd,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr out);
void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
std::shared_ptr<mkldnn::sum::primitive_desc>& pd,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out);
/**
* Backward functions: reset buffers(inputs, output, bias)
*/
void resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out);
};
} // namespace paddle
...@@ -60,18 +60,16 @@ void MKLDNNFcLayer::convertWeightsFromPaddle() { ...@@ -60,18 +60,16 @@ void MKLDNNFcLayer::convertWeightsFromPaddle() {
} }
CHECK(wgtVal_) << "should have been initialized"; CHECK(wgtVal_) << "should have been initialized";
bool hasNoSpatial_ = ih_ == 1 && iw_ == 1;
auto targetDim = wgtVal_->getDims(); auto targetDim = wgtVal_->getDims();
auto srcFmt = hasNoSpatial_ ? format::io : format::ihwo; auto srcFmt = targetDim.size() == 2 ? format::io : format::ihwo;
wgtVal_->reorderDataFrom(wgtVal_, srcFmt, targetDim); wgtVal_->reorderDataFrom(wgtVal_, srcFmt, targetDim);
hasInitedWgt_ = true; hasInitedWgt_ = true;
} }
void MKLDNNFcLayer::convertWeightsToPaddle() { void MKLDNNFcLayer::convertWeightsToPaddle() {
CHECK(wgtVal_) << "should have been initialized"; CHECK(wgtVal_) << "should have been initialized";
bool hasNoSpatial_ = ih_ == 1 && iw_ == 1;
auto targetDim = wgtVal_->getDims(); auto targetDim = wgtVal_->getDims();
auto dstFmt = hasNoSpatial_ ? format::io : format::ihwo; auto dstFmt = targetDim.size() == 2 ? format::io : format::ihwo;
wgtVal_->reorderDataTo(wgtVal_, dstFmt, targetDim); wgtVal_->reorderDataTo(wgtVal_, dstFmt, targetDim);
} }
......
...@@ -77,7 +77,7 @@ void MKLDNNLayer::forward(PassType passType) { ...@@ -77,7 +77,7 @@ void MKLDNNLayer::forward(PassType passType) {
needResetBwd_ = true; needResetBwd_ = true;
} }
if (inputLayers_[0]->getType() == "data") { if (inputLayers_[0]->getType() == "data" && inputLayers_.size() == 1) {
// Update input value data when input layer is "data" type, // Update input value data when input layer is "data" type,
// since the input value data address might be changed. // since the input value data address might be changed.
CHECK(extInVal_); CHECK(extInVal_);
...@@ -171,29 +171,27 @@ void MKLDNNLayer::resetWithMatrix(MKLDNNMatrixPtr& dnn, ...@@ -171,29 +171,27 @@ void MKLDNNLayer::resetWithMatrix(MKLDNNMatrixPtr& dnn,
} }
void MKLDNNLayer::resetInValue( void MKLDNNLayer::resetInValue(
MKLDNNMatrixPtr& in, const std::shared_ptr<memory::primitive_desc>& intPD) { MKLDNNMatrixPtr& in,
const std::shared_ptr<memory::primitive_desc>& intPD,
size_t inputIdx) {
cvtInVal_ = nullptr; cvtInVal_ = nullptr;
extInVal_ = nullptr; extInVal_ = nullptr;
in = nullptr; in = nullptr;
CHECK_GT(bs_ * ic_ * ih_ * iw_, 0); CHECK_GT(bs_ * ic_ * ih_ * iw_, 0);
auto extPD = MKLDNNMatrix::createPrimitiveDesc( auto extPD = MKLDNNMatrix::createPrimitiveDesc(
{bs_, ic_, ih_, iw_}, format::nchw, engine_); {bs_, ic_, ih_, iw_}, format::nchw, engine_);
const MatrixPtr& inMat = inputLayers_[0]->getOutputValue(); const MatrixPtr& inMat = inputLayers_[inputIdx]->getOutputValue();
in = std::dynamic_pointer_cast<MKLDNNMatrix>(inMat); extInVal_ = std::dynamic_pointer_cast<MKLDNNMatrix>(inMat);
CHECK_EQ(inputIsOnlyMKLDNN(), in != nullptr); CHECK_EQ(inputIsOnlyMKLDNN(), extInVal_ != nullptr);
if (in == nullptr || in->getFormat() == format::nc) { if (extInVal_ == nullptr || extInVal_->getFormat() == format::nc) {
in = MKLDNNMatrix::create(extPD, inMat); extInVal_ = MKLDNNMatrix::create(extPD, inMat);
}
extInVal_ = isPaddleFormat(in->getFormat()) ? in : nullptr;
if (in->getFormat() == format::nc) {
CHECK(ih_ == 1 && iw_ == 1);
} }
in = extInVal_;
if (nullptr == intPD || in->getPrimitiveDesc() == *intPD) { if (nullptr == intPD || in->getPrimitiveDesc() == *intPD) {
return; return;
} }
// need to create a reorder // need to create a reorder
in = MKLDNNMatrix::create(*intPD); in = MKLDNNMatrix::create(*intPD);
extInVal_ = extInVal_ ? extInVal_ : MKLDNNMatrix::create(extPD, inMat);
cvtInVal_ = MKLDNNMatrix::createReorder(extInVal_, in); cvtInVal_ = MKLDNNMatrix::createReorder(extInVal_, in);
CHECK(cvtInVal_) << "should not be empty"; CHECK(cvtInVal_) << "should not be empty";
} }
...@@ -216,11 +214,12 @@ void MKLDNNLayer::resetOutValue(MKLDNNMatrixPtr& out, ...@@ -216,11 +214,12 @@ void MKLDNNLayer::resetOutValue(MKLDNNMatrixPtr& out,
} }
void MKLDNNLayer::resetInGrad(MKLDNNMatrixPtr& in, void MKLDNNLayer::resetInGrad(MKLDNNMatrixPtr& in,
memory::primitive_desc intPD) { memory::primitive_desc intPD,
size_t inputIdx) {
cvtInGrad_ = nullptr; cvtInGrad_ = nullptr;
extInGrad_ = nullptr; extInGrad_ = nullptr;
in = nullptr; in = nullptr;
LayerPtr& input = inputLayers_[0]; LayerPtr& input = inputLayers_[inputIdx];
if (input->getOutputGrad() == nullptr) { if (input->getOutputGrad() == nullptr) {
// no need input grad // no need input grad
return; return;
...@@ -245,7 +244,6 @@ void MKLDNNLayer::resetInGrad(MKLDNNMatrixPtr& in, ...@@ -245,7 +244,6 @@ void MKLDNNLayer::resetInGrad(MKLDNNMatrixPtr& in,
return; return;
} }
// need to create a reorder // need to create a reorder
// TODO(TJ): add macro definition to simplify it
CHECK(extInVal_ != nullptr && isPaddleFormat(extInVal_->getFormat())) CHECK(extInVal_ != nullptr && isPaddleFormat(extInVal_->getFormat()))
<< "should have external input value and the format must be nchw(nc)"; << "should have external input value and the format must be nchw(nc)";
extInGrad_ = MKLDNNMatrix::create(extInVal_->getPrimitiveDesc(), inMat); extInGrad_ = MKLDNNMatrix::create(extInVal_->getPrimitiveDesc(), inMat);
......
...@@ -199,7 +199,8 @@ protected: ...@@ -199,7 +199,8 @@ protected:
*/ */
void resetInValue( void resetInValue(
MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& in,
const std::shared_ptr<mkldnn::memory::primitive_desc>& intPD = nullptr); const std::shared_ptr<mkldnn::memory::primitive_desc>& intPD = nullptr,
size_t inputIdx = 0);
/** /**
* reset output value from internal primitive desc. * reset output value from internal primitive desc.
...@@ -212,7 +213,9 @@ protected: ...@@ -212,7 +213,9 @@ protected:
* reset input grad from internal primitive desc. * reset input grad from internal primitive desc.
* reset both internal and external buffer and create reorder if necessary. * reset both internal and external buffer and create reorder if necessary.
*/ */
void resetInGrad(MKLDNNMatrixPtr& in, mkldnn::memory::primitive_desc intPD); void resetInGrad(MKLDNNMatrixPtr& in,
mkldnn::memory::primitive_desc intPD,
size_t inputIdx = 0);
/** /**
* reset output grad from internal primitive desc. * reset output grad from internal primitive desc.
......
...@@ -98,8 +98,19 @@ void SubSequenceLayer::forward(PassType passType) { ...@@ -98,8 +98,19 @@ void SubSequenceLayer::forward(PassType passType) {
CHECK_EQ(numSequences2, numSequences3); CHECK_EQ(numSequences2, numSequences3);
MatrixPtr inputValue = input.value; MatrixPtr inputValue = input.value;
IVectorPtr offsetValue = offsetSeq.ids; IVectorPtr offsetValue;
IVectorPtr sizeValue = sizeSeq.ids; IVectorPtr sizeValue;
if (useGpu_) {
// copy to cpu
IVector::resizeOrCreate(offsetValue, offsetSeq.ids->getSize(), false);
IVector::resizeOrCreate(sizeValue, sizeSeq.ids->getSize(), false);
offsetValue->copyFrom(*offsetSeq.ids);
sizeValue->copyFrom(*sizeSeq.ids);
} else {
offsetValue = offsetSeq.ids;
sizeValue = sizeSeq.ids;
}
CHECK_EQ(offsetValue->getSize(), numSequences1); CHECK_EQ(offsetValue->getSize(), numSequences1);
CHECK_EQ(sizeValue->getSize(), numSequences1); CHECK_EQ(sizeValue->getSize(), numSequences1);
...@@ -176,8 +187,21 @@ void SubSequenceLayer::backward(const UpdateCallback& callback) { ...@@ -176,8 +187,21 @@ void SubSequenceLayer::backward(const UpdateCallback& callback) {
size_t numSequences1 = startPositions1->getSize() - 1; size_t numSequences1 = startPositions1->getSize() - 1;
const int* starts1 = startPositions1->getData(); const int* starts1 = startPositions1->getData();
IVectorPtr offsetValue = getInput(1).ids; const Argument& offsetSeq = getInput(1);
IVectorPtr sizeValue = getInput(2).ids; const Argument& sizeSeq = getInput(2);
IVectorPtr offsetValue;
IVectorPtr sizeValue;
if (useGpu_) {
// copy to cpu
IVector::resizeOrCreate(offsetValue, offsetSeq.ids->getSize(), false);
IVector::resizeOrCreate(sizeValue, sizeSeq.ids->getSize(), false);
offsetValue->copyFrom(*offsetSeq.ids);
sizeValue->copyFrom(*sizeSeq.ids);
} else {
offsetValue = offsetSeq.ids;
sizeValue = sizeSeq.ids;
}
int* offsets = offsetValue->getData(); int* offsets = offsetValue->getData();
int* sizes = sizeValue->getData(); int* sizes = sizeValue->getData();
......
...@@ -132,7 +132,7 @@ void MKLDNNTester::checkForward() { ...@@ -132,7 +132,7 @@ void MKLDNNTester::checkForward() {
VLOG(MKLDNN_TESTS) << "Check Forward"; VLOG(MKLDNN_TESTS) << "Check Forward";
printTopDatas(); printTopDatas();
double delta = double delta =
compareMatrix(dnnLayer_->getOutputValue(), refLayer_->getOutputValue()); compareMatrix(refLayer_->getOutputValue(), dnnLayer_->getOutputValue());
EXPECT_LE(fabs(delta), eps_); EXPECT_LE(fabs(delta), eps_);
} }
...@@ -147,7 +147,7 @@ void MKLDNNTester::checkBackwardData() { ...@@ -147,7 +147,7 @@ void MKLDNNTester::checkBackwardData() {
VLOG(MKLDNN_ALL) << "Reference Backward Result: InputGrad " << i; VLOG(MKLDNN_ALL) << "Reference Backward Result: InputGrad " << i;
printMatrix(refDiff); printMatrix(refDiff);
double delta = compareMatrix(dnnDiff, refDiff); double delta = compareMatrix(refDiff, dnnDiff);
EXPECT_LE(fabs(delta), eps_); EXPECT_LE(fabs(delta), eps_);
if (isBN) { if (isBN) {
// the other two inputs in batch norm are for moving mean and var // the other two inputs in batch norm are for moving mean and var
...@@ -177,7 +177,7 @@ void MKLDNNTester::checkBackwardWgts() { ...@@ -177,7 +177,7 @@ void MKLDNNTester::checkBackwardWgts() {
<< parameters_[REF][i]->getName(); << parameters_[REF][i]->getName();
printVector(ref); printVector(ref);
double delta = compareVector(dnn, ref); double delta = compareVector(ref, dnn);
EXPECT_LE(fabs(delta), eps_); EXPECT_LE(fabs(delta), eps_);
} }
......
...@@ -271,20 +271,53 @@ TEST(MKLDNNLayer, BatchNormLayer) { ...@@ -271,20 +271,53 @@ TEST(MKLDNNLayer, BatchNormLayer) {
testBatchNormLayer({16, 32, 16, 16}); testBatchNormLayer({16, 32, 16, 16});
} }
struct testActDesc { struct testImageDesc {
int bs, ic, ih, iw; int bs, ic, ih, iw;
}; };
static void getAddtoConfig(TestConfig& cfg, const testActDesc& pm) { static void getAddtoConfig(TestConfig& cfg,
const testImageDesc& pm,
const size_t nInputs = 1) {
cfg.biasSize = 0; cfg.biasSize = 0;
cfg.layerConfig.set_type("addto"); cfg.layerConfig.set_type("addto");
size_t layerSize = pm.ic * pm.ih * pm.iw; size_t layerSize = pm.ic * pm.ih * pm.iw;
cfg.layerConfig.set_size(layerSize); cfg.layerConfig.set_size(layerSize);
cfg.inputDefs.push_back({INPUT_DATA, "layer_0", layerSize, 0}); cfg.layerConfig.set_active_type("relu");
cfg.layerConfig.add_inputs(); for (size_t i = 0; i < nInputs; ++i) {
std::stringstream ss;
ss << "layer_" << i;
cfg.inputDefs.push_back({INPUT_DATA, ss.str(), layerSize, 0});
LayerInputConfig* input = cfg.layerConfig.add_inputs();
ImageConfig* img_conf = input->mutable_image_conf();
img_conf->set_channels(pm.ic);
img_conf->set_img_size_y(pm.ih);
img_conf->set_img_size(pm.iw);
}
}
void testAddtoLayer(const testImageDesc& pm, const size_t nInputs) {
CHECK_GE(nInputs, 1);
TestConfig dnnConfig;
getAddtoConfig(dnnConfig, pm, nInputs);
dnnConfig.layerConfig.set_type("mkldnn_addto");
// TODO(TJ): test with bias
for (auto withBias : {false}) {
if (withBias) {
dnnConfig.biasSize = pm.ic * pm.ih * pm.iw;
} else {
dnnConfig.biasSize = 0;
}
RUN_MKLDNN_TEST_LAYER(dnnConfig, "addto", pm)
}
}
TEST(MKLDNNLayer, AddtoLayer) {
testAddtoLayer({16, 5, 14, 14}, 1);
testAddtoLayer({8, 10, 8, 8}, 2);
testAddtoLayer({4, 12, 1, 1}, 3);
} }
void testActivation(std::string actType, const testActDesc& pm) { void testActivation(std::string actType, const testImageDesc& pm) {
// TODO(TJ): remove me when paddle support elu activation // TODO(TJ): remove me when paddle support elu activation
if (actType == "mkldnn_elu") { if (actType == "mkldnn_elu") {
return; return;
......
...@@ -18,6 +18,7 @@ limitations under the License. */ ...@@ -18,6 +18,7 @@ limitations under the License. */
#include <memory> #include <memory>
#include "Matrix.h" #include "Matrix.h"
#include "hl_gpu.h" #include "hl_gpu.h"
#include "hl_matrix.h"
#include "hl_table_apply.h" #include "hl_table_apply.h"
#include "paddle/utils/Flags.h" #include "paddle/utils/Flags.h"
#include "paddle/utils/Logging.h" #include "paddle/utils/Logging.h"
...@@ -99,6 +100,19 @@ MatrixPtr VectorT<int>::toOneHotSparseMatrix(size_t idRange, bool useGpu) { ...@@ -99,6 +100,19 @@ MatrixPtr VectorT<int>::toOneHotSparseMatrix(size_t idRange, bool useGpu) {
return mat; return mat;
} }
template <>
std::shared_ptr<VectorT<int>> VectorT<real>::castToInt() {
std::shared_ptr<VectorT<int>> ret = IVector::create(this->getSize(), useGpu_);
if (useGpu_) {
hl_vector_cast2int(ret->getData(), this->getData(), this->getSize());
} else {
for (size_t i = 0; i < getSize(); ++i) {
ret->getData()[i] = int(this->getData()[i]);
}
}
return ret;
}
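A hypothetical CPU-side usage sketch of `castToInt`, assuming Paddle's usual `real`, `VectorPtr`, and `IVectorPtr` typedefs; sizes and values are ours:

```
// Build a real-valued vector {0, 1, 0, 1} and cast it element-wise to int.
VectorPtr labels = Vector::create(/*size=*/4, /*useGpu=*/false);
for (size_t i = 0; i < labels->getSize(); ++i) {
  labels->getData()[i] = static_cast<real>(i % 2);
}
IVectorPtr intLabels = labels->castToInt();
```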
template <class T> template <class T>
GpuVectorT<T>::GpuVectorT(size_t size) GpuVectorT<T>::GpuVectorT(size_t size)
: VectorT<T>(size, : VectorT<T>(size,
......
...@@ -162,6 +162,13 @@ public: ...@@ -162,6 +162,13 @@ public:
*/ */
std::shared_ptr<Matrix> toOneHotSparseMatrix(size_t idRange, bool useGpu); std::shared_ptr<Matrix> toOneHotSparseMatrix(size_t idRange, bool useGpu);
/**
* @brief cast vector of "real" elements to "int" elements.
*
* @note: float -> int must be casted, or you'll get wrong data.
*/
std::shared_ptr<VectorT<int>> castToInt();
/** /**
* This function will crash if the size of src and dest is different. * This function will crash if the size of src and dest is different.
*/ */
......
...@@ -62,6 +62,11 @@ function(op_library TARGET) ...@@ -62,6 +62,11 @@ function(op_library TARGET)
file(APPEND ${pybind_file} "USE_OP(pool2d);\n") file(APPEND ${pybind_file} "USE_OP(pool2d);\n")
endif() endif()
if ("${TARGET}" STREQUAL "compare_op")
set(pybind_flag 1)
file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(equal);\n")
endif()
# pool_with_index_op contains several operators # pool_with_index_op contains several operators
if ("${TARGET}" STREQUAL "pool_with_index_op") if ("${TARGET}" STREQUAL "pool_with_index_op")
set(pybind_flag 1) set(pybind_flag 1)
...@@ -69,6 +74,20 @@ function(op_library TARGET) ...@@ -69,6 +74,20 @@ function(op_library TARGET)
file(APPEND ${pybind_file} "USE_OP(max_pool2d_with_index);\n") file(APPEND ${pybind_file} "USE_OP(max_pool2d_with_index);\n")
endif() endif()
# conv_op contains several operators
if ("${TARGET}" STREQUAL "conv_op")
set(pybind_flag 1)
# It's enough to add just one operator to pybind
file(APPEND ${pybind_file} "USE_OP(conv2d);\n")
endif()
# conv_transpose_op contains several operators
if ("${TARGET}" STREQUAL "conv_transpose_op")
set(pybind_flag 1)
# It's enough to add just one operator to pybind
file(APPEND ${pybind_file} "USE_OP(conv2d_transpose);\n")
endif()
# pool_cudnn_op contains several operators # pool_cudnn_op contains several operators
if ("${TARGET}" STREQUAL "pool_cudnn_op") if ("${TARGET}" STREQUAL "pool_cudnn_op")
set(pybind_flag 1) set(pybind_flag 1)
...@@ -96,7 +115,7 @@ function(op_library TARGET) ...@@ -96,7 +115,7 @@ function(op_library TARGET)
# It's enough to add just one operator to pybind # It's enough to add just one operator to pybind
file(APPEND ${pybind_file} "USE_GPU_ONLY_OP(ncclAllReduce);\n") file(APPEND ${pybind_file} "USE_GPU_ONLY_OP(ncclAllReduce);\n")
endif() endif()
# reduce_op contains several operators # reduce_op contains several operators
if ("${TARGET}" STREQUAL "reduce_op") if ("${TARGET}" STREQUAL "reduce_op")
set(pybind_flag 1) set(pybind_flag 1)
...@@ -104,6 +123,11 @@ function(op_library TARGET) ...@@ -104,6 +123,11 @@ function(op_library TARGET)
file(APPEND ${pybind_file} "USE_OP(reduce_sum);\n") file(APPEND ${pybind_file} "USE_OP(reduce_sum);\n")
endif() endif()
if ("${TARGET}" STREQUAL "tensor_array_read_write_op")
set(pybind_flag 1)
file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(read_from_array);\nUSE_NO_KERNEL_OP(write_to_array);\n")
endif()
# pybind USE_NO_KERNEL_OP # pybind USE_NO_KERNEL_OP
# HACK: if REGISTER_OP_CPU_KERNEL is present, the operator must have a kernel # HACK: if REGISTER_OP_CPU_KERNEL is present, the operator must have a kernel
file(READ ${TARGET}.cc TARGET_CONTENT) file(READ ${TARGET}.cc TARGET_CONTENT)
...@@ -139,24 +163,42 @@ set(DEPS_OPS ...@@ -139,24 +163,42 @@ set(DEPS_OPS
sum_op sum_op
pool_op pool_op
pool_with_index_op pool_with_index_op
conv_op
lstm_op
conv_transpose_op
nccl_op nccl_op
sequence_conv_op sequence_conv_op
lstm_op) sequence_pool_op
lod_rank_table_op
lod_tensor_to_array_op
array_to_lod_tensor_op
lstm_op
tensor_array_read_write_op
gru_op)
op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op) op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op)
op_library(cross_entropy_op DEPS cross_entropy) op_library(cross_entropy_op DEPS cross_entropy)
op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax) op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
op_library(conv_op DEPS vol2col)
op_library(sum_op DEPS net_op selected_rows_functor) op_library(sum_op DEPS net_op selected_rows_functor)
op_library(pool_op DEPS pooling) op_library(pool_op DEPS pooling)
op_library(pool_with_index_op DEPS pooling) op_library(pool_with_index_op DEPS pooling)
op_library(lod_rank_table_op SRCS lod_rank_table_op.cc DEPS lod_rank_table)
op_library(lod_tensor_to_array_op SRCS lod_tensor_to_array_op.cc DEPS lod_rank_table_op)
op_library(array_to_lod_tensor_op SRCS array_to_lod_tensor_op.cc DEPS lod_rank_table_op)
op_library(tensor_array_read_write_op SRCS tensor_array_read_write_op.cc)
if(WITH_GPU)
  op_library(nccl_op DEPS nccl_common)
endif()
op_library(sequence_conv_op DEPS context_project)
op_library(sequence_pool_op DEPS sequence_pooling)
op_library(lstm_op DEPS sequence2batch lstm_compute)
op_library(conv_transpose_op DEPS vol2col)
op_library(gru_op DEPS sequence2batch gru_compute)
op_library(dynamic_recurrent_op SRCS dynamic_recurrent_op.cc rnn/recurrent_op_utils.cc
           DEPS net_op tensor_array)
op_library(recurrent_op SRCS recurrent_op.cc DEPS executor)
list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
foreach(src ${GENERAL_OPS})
  op_library(${src})
......
...@@ -33,7 +33,7 @@ class AccuracyOp : public framework::OperatorWithKernel {
auto inference_dim = ctx->GetInputDim("Out");
auto label_dim = ctx->GetInputDim("Label");
// Assume indices has same shape as inference, because
// it's the output of topk.
PADDLE_ENFORCE_EQ(label_dim.size(), 2, "label's rank must be 2.");
...@@ -47,10 +47,11 @@ class AccuracyOp : public framework::OperatorWithKernel {
}

protected:
  framework::OpKernelType GetKernelType(
      const framework::ExecutionContext &ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<Tensor>("Out")->type()),
        ctx.device_context());
  }
};
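This hunk is one instance of a refactor that recurs through the rest of this patch (see also auc_op and batch_norm_op below): operators now describe their kernel with a full framework::OpKernelType, pairing the inferred data type with the current device context, instead of returning a bare framework::DataType from IndicateDataType. The post-refactor pattern, written out once as a sketch ("X" stands for whichever input the operator keys its kernel on):

    // Pattern used throughout this patch: pick the kernel from an input's
    // data type plus the device context the op is running on.
    framework::OpKernelType GetKernelType(
        const framework::ExecutionContext &ctx) const override {
      return framework::OpKernelType(
          framework::ToDataType(ctx.Input<Tensor>("X")->type()),
          ctx.device_context());
    }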
...@@ -60,20 +61,24 @@ class AccuracyOpMaker : public framework::OpProtoAndCheckerMaker {
                framework::OpAttrChecker *op_checker)
    : OpProtoAndCheckerMaker(proto, op_checker) {
  // TODO(typhoonzero): support both inference value and indices.
  AddInput("Out", "The network output of topk (inferences)");
  AddInput("Indices", "The network output of topk (indices)");
  AddInput("Label", "Label of the training data");
  // TODO(typhoonzero): AddInput("Weight", ...
  AddOutput("Accuracy", "The accuracy of current batch");
  AddComment(R"DOC(
Accuracy Operator.

It will print the accuracy rate for classification.
The accuracy is calculated as follows:

$$accuracy = \frac{NumOfCorrectPredicts}{NumOfAllSamples}$$

Both the input Out and Label can carry the LoD (Level of Details)
information, or not. But the output only shares the LoD information
with the input Out(Inference).

)DOC");
}
};
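As a worked example of the formula above, a framework-free sketch (hypothetical helper, not part of the patch) that scores top-k indices against labels:

    #include <cstdint>
    #include <vector>
    // Counts a sample as correct if its label appears among its top-k indices.
    float Accuracy(const std::vector<std::vector<int64_t>>& topk_indices,
                   const std::vector<int64_t>& labels) {
      if (labels.empty()) return 0.0f;
      int num_correct = 0;
      for (size_t i = 0; i < labels.size(); ++i) {
        for (int64_t idx : topk_indices[i]) {
          if (idx == labels[i]) { ++num_correct; break; }
        }
      }
      return static_cast<float>(num_correct) / labels.size();
    }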
......
...@@ -72,11 +72,8 @@ class AccuracyOpCUDAKernel : public framework::OpKernel<T> {
}
AccuracyCudaKernel<PADDLE_CUDA_NUM_THREADS><<<
    1, PADDLE_CUDA_NUM_THREADS, 0, ctx.cuda_device_context().stream()>>>(
    num_samples, infer_width, indices_data, label_data, accuracy_data);
}
};
......
...@@ -43,7 +43,12 @@ class SigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("X", "Input of Sigmoid operator");
  AddOutput("Y", "Output of Sigmoid operator");
  AddComment(R"DOC(
Sigmoid Activation Operator.

$y = 1 / (1 + e^{-x})$

)DOC");
}
};
...@@ -54,8 +59,12 @@ class LogSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("X", "Input of LogSigmoid operator");
  AddOutput("Y", "Output of LogSigmoid operator");
  AddComment(R"DOC(
Logsigmoid Activation Operator.

$y = \log(1 / (1 + e^{-x}))$

)DOC");
}
};
...@@ -65,7 +74,12 @@ class ExpOpMaker : public framework::OpProtoAndCheckerMaker {
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("X", "Input of Exp operator");
  AddOutput("Y", "Output of Exp operator");
  AddComment(R"DOC(
Exp Activation Operator.

$y = e^x$

)DOC");
}
};
...@@ -75,7 +89,12 @@ class ReluOpMaker : public framework::OpProtoAndCheckerMaker {
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("X", "Input of Relu operator");
  AddOutput("Y", "Output of Relu operator");
  AddComment(R"DOC(
Relu Activation Operator.

$y = \max(x, 0)$

)DOC");
}
};
...@@ -87,11 +106,14 @@ class LeakyReluOpMaker : public framework::OpProtoAndCheckerMaker {
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("X", "Input of LeakyRelu operator");
  AddOutput("Y", "Output of LeakyRelu operator");
  AddAttr<AttrType>("alpha", "The small negative slope")
      .SetDefault(static_cast<AttrType>(0.02f));
  AddComment(R"DOC(
LeakyRelu Activation Operator.

$y = \max(x, \alpha * x)$

)DOC");
}
};
...@@ -103,12 +125,20 @@ class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("X", "Input of Softshrink operator");
  AddOutput("Y", "Output of Softshrink operator");
  AddAttr<AttrType>("lambda", "non-negative offset")
      .SetDefault(static_cast<AttrType>(0.5f));
  AddComment(R"DOC(
Softshrink Activation Operator.

$$
y = \begin{cases}
    x - \lambda, \text{if } x > \lambda \\
    x + \lambda, \text{if } x < -\lambda \\
    0, \text{otherwise}
    \end{cases}
$$

)DOC");
}
};
...@@ -118,9 +148,12 @@ class TanhOpMaker : public framework::OpProtoAndCheckerMaker {
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("X", "Input of Tanh operator");
  AddOutput("Y", "Output of Tanh operator");
  AddComment(R"DOC(
Tanh Activation Operator.

$$y = \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$

)DOC");
}
};
...@@ -131,7 +164,12 @@ class TanhShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("X", "Input of TanhShrink operator");
  AddOutput("Y", "Output of TanhShrink operator");
  AddComment(R"DOC(
TanhShrink Activation Operator.

$$y = x - \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$

)DOC");
}
};
...@@ -143,13 +181,20 @@ class HardShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("X", "Input of HardShrink operator");
  AddOutput("Y", "Output of HardShrink operator");
  AddAttr<AttrType>("threshold", "The value of threshold for HardShrink")
      .SetDefault(static_cast<AttrType>(0.5));
  AddComment(R"DOC(
HardShrink Activation Operator.

$$
y = \begin{cases}
    x, \text{if } x > \lambda \\
    x, \text{if } x < -\lambda \\
    0, \text{otherwise}
    \end{cases}
$$

)DOC");
}
};
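Read as code, the piecewise definition above is just a pass-through outside the [-lambda, lambda] band. A framework-free sketch (threshold plays the role of lambda, matching the attr above):

    // hard_shrink: zero out values inside [-threshold, threshold].
    float HardShrink(float x, float threshold) {
      return (x > threshold || x < -threshold) ? x : 0.0f;
    }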
...@@ -159,7 +204,12 @@ class SqrtOpMaker : public framework::OpProtoAndCheckerMaker {
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("X", "Input of Sqrt operator");
  AddOutput("Y", "Output of Sqrt operator");
  AddComment(R"DOC(
Sqrt Activation Operator.

$y = \sqrt{x}$

)DOC");
}
};
...@@ -169,7 +219,12 @@ class AbsOpMaker : public framework::OpProtoAndCheckerMaker {
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("X", "Input of Abs operator");
  AddOutput("Y", "Output of Abs operator");
  AddComment(R"DOC(
Abs Activation Operator.

$y = |x|$

)DOC");
}
};
...@@ -180,7 +235,12 @@ class ReciprocalOpMaker : public framework::OpProtoAndCheckerMaker {
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("X", "Input of Reciprocal operator");
  AddOutput("Y", "Output of Reciprocal operator");
  AddComment(R"DOC(
Reciprocal Activation Operator.

$$y = \frac{1}{x}$$

)DOC");
}
};
...@@ -190,7 +250,14 @@ class LogOpMaker : public framework::OpProtoAndCheckerMaker {
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("X", "Input of Log operator");
  AddOutput("Y", "Output of Log operator");
  AddComment(R"DOC(
Log Activation Operator.

$y = \ln(x)$

Natural logarithm of x.

)DOC");
}
};
...@@ -200,7 +267,12 @@ class SquareOpMaker : public framework::OpProtoAndCheckerMaker {
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("X", "Input of Square operator");
  AddOutput("Y", "Output of Square operator");
  AddComment(R"DOC(
Square Activation Operator.

$y = x^2$

)DOC");
}
};
...@@ -211,7 +283,12 @@ class SoftplusOpMaker : public framework::OpProtoAndCheckerMaker {
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("X", "Input of Softplus operator");
  AddOutput("Y", "Output of Softplus operator");
  AddComment(R"DOC(
Softplus Activation Operator.

$y = \ln(1 + e^{x})$

)DOC");
}
};
...@@ -222,7 +299,12 @@ class SoftsignOpMaker : public framework::OpProtoAndCheckerMaker {
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("X", "Input of Softsign operator");
  AddOutput("Y", "Output of Softsign operator");
  AddComment(R"DOC(
Softsign Activation Operator.

$$y = \frac{x}{1 + |x|}$$

)DOC");
}
};
...@@ -233,11 +315,16 @@ class BReluOpMaker : public framework::OpProtoAndCheckerMaker {
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("X", "Input of BRelu operator");
  AddOutput("Y", "Output of BRelu operator");
  AddAttr<AttrType>("t_min", "The min marginal value of BRelu")
      .SetDefault(static_cast<AttrType>(0));
  AddAttr<AttrType>("t_max", "The max marginal value of BRelu")
      .SetDefault(static_cast<AttrType>(24));
  AddComment(R"DOC(
BRelu Activation Operator.

$y = \max(\min(x, t_{max}), t_{min})$

)DOC");
}
};
...@@ -249,11 +336,14 @@ class SoftReluOpMaker : public framework::OpProtoAndCheckerMaker {
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("X", "Input of SoftRelu operator");
  AddOutput("Y", "Output of SoftRelu operator");
  AddAttr<AttrType>("threshold", "The threshold value of SoftRelu")
      .SetDefault(static_cast<AttrType>(40));
  AddComment(R"DOC(
SoftRelu Activation Operator.

$y = \ln(1 + e^{\max(\min(x, threshold), -threshold)})$

)DOC");
}
};
...@@ -262,19 +352,19 @@ class ELUOpMaker : public framework::OpProtoAndCheckerMaker {
public:
  ELUOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of ELU operator");
    AddOutput("Y", "Output of ELU operator");
    AddAttr<AttrType>("alpha", "The alpha value of ELU")
        .SetDefault(static_cast<AttrType>(1.0f));
    AddComment(R"DOC(
ELU Activation Operator.

Applies the following element-wise computation on the input according to
https://arxiv.org/abs/1511.07289.

$y = \max(0, x) + \min(0, \alpha * (e^x - 1))$

)DOC");
  }
};
...@@ -285,9 +375,14 @@ class Relu6OpMaker : public framework::OpProtoAndCheckerMaker {
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("X", "Input of Relu6 operator");
  AddOutput("Y", "Output of Relu6 operator");
  AddAttr<AttrType>("threshold", "The threshold value of Relu6")
      .SetDefault(static_cast<AttrType>(6));
  AddComment(R"DOC(
Relu6 Activation Operator.

$y = \min(\max(0, x), 6)$

)DOC");
}
};
...@@ -298,9 +393,14 @@ class PowOpMaker : public framework::OpProtoAndCheckerMaker {
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("X", "Input of Pow operator");
  AddOutput("Y", "Output of Pow operator");
  AddAttr<AttrType>("factor", "The exponential factor of Pow")
      .SetDefault(static_cast<AttrType>(1));
  AddComment(R"DOC(
Pow Activation Operator.

$y = x^{factor}$

)DOC");
}
};
...@@ -311,11 +411,16 @@ class STanhOpMaker : public framework::OpProtoAndCheckerMaker {
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("X", "Input of STanh operator");
  AddOutput("Y", "Output of STanh operator");
  AddAttr<AttrType>("scale_a", "The scale parameter of a for the input")
      .SetDefault(static_cast<AttrType>(2.0f / 3.0f));
  AddAttr<AttrType>("scale_b", "The scale parameter of b for the input")
      .SetDefault(static_cast<AttrType>(1.7159));
  AddComment(R"DOC(
STanh Activation Operator.

$$y = b * \frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}}$$

)DOC");
}
};
...@@ -327,12 +432,19 @@ class ThresholdedReluOpMaker : public framework::OpProtoAndCheckerMaker {
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("X", "Input of ThresholdedRelu operator");
  AddOutput("Y", "Output of ThresholdedRelu operator");
  AddAttr<AttrType>("threshold", "The threshold location of activation")
      .SetDefault(static_cast<AttrType>(1.0));
  AddComment(R"DOC(
ThresholdedRelu Activation Operator.

$$
y = \begin{cases}
    x, \text{if } x > threshold \\
    0, \text{otherwise}
    \end{cases}
$$

)DOC");
}
};
...@@ -344,27 +456,23 @@ class HardSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("X", "Input of HardSigmoid operator");
  AddOutput("Y", "Output of HardSigmoid operator");
  AddAttr<AttrType>("slope", "Slope for linear approximation of sigmoid")
      .SetDefault(static_cast<AttrType>(0.2));
  AddAttr<AttrType>("offset", "Offset for linear approximation of sigmoid")
      .SetDefault(static_cast<AttrType>(0.5));
  AddComment(R"DOC(
HardSigmoid Activation Operator.

Segment-wise linear approximation of sigmoid (https://arxiv.org/abs/1603.00391),
which is much faster than sigmoid.

$y = \max(0, \min(1, slope * x + offset))$

The slope should be positive. The offset can be either positive or negative.
The default slope and offset are set according to the above reference.
It is recommended to use the defaults for this activation.

)DOC");
}
};
......
...@@ -232,7 +232,7 @@ struct HardShrinkGradFunctor : public BaseActivationFunctor<T> {
}
};

// softshrink(x) = x - lambda, if x > lambda; x + lambda, if x < -lambda; 0
// otherwise
template <typename T>
struct SoftShrinkFunctor : public BaseActivationFunctor<T> {
......
...@@ -64,16 +64,15 @@ class AdadeltaOpMaker : public framework::OpProtoAndCheckerMaker {
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("Param", "(Tensor) Input parameter");
  AddInput("Grad", "(Tensor) Input gradient");
  AddInput("AvgSquaredGrad", "(Tensor) Input average of squared gradient");
  AddInput("AvgSquaredUpdate",
           "(Tensor) Input average of squared parameter updates");
  AddOutput("ParamOut", "(Tensor) Output parameter");
  AddOutput("AvgSquaredGradOut",
            "(Tensor) Output average of squared gradient");
  AddOutput("AvgSquaredUpdateOut",
            "(Tensor) Output average of squared parameter updates");
  AddAttr<float>("rho",
                 "(float, default 0.95) Exponential decay rate "
...@@ -84,22 +83,21 @@ class AdadeltaOpMaker : public framework::OpProtoAndCheckerMaker {
                 "numerical stability")
      .SetDefault(1.0e-6f);
  AddComment(R"DOC(
Adadelta Optimizer.

Adadelta optimizer is implemented as explained in:
https://arxiv.org/abs/1212.5701
Adadelta is a per-dimension adaptive learning rate method used
for gradient descent.

Adadelta updates are as follows:

$$avgSquaredGradOut = \rho * avgSquaredGrad + (1 - \rho) * grad * grad \break
paramUpdate = -\sqrt{(avgSquaredUpdate + \epsilon) / (avgSquaredGradOut + \epsilon)} * grad \break
avgSquaredUpdateOut = \rho * avgSquaredUpdate + (1 - \rho) * (paramUpdate)^2 \break
paramOut = param + paramUpdate$$

)DOC");
}
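As a plain-C++ reading of the update above (a minimal sketch, not the operator kernel; the names are the document's math symbols):

    #include <cmath>
    #include <vector>
    // One Adadelta step over flat arrays; rho and epsilon as in the attrs above.
    void AdadeltaStep(std::vector<float>& param, const std::vector<float>& grad,
                      std::vector<float>& avg_sq_grad,
                      std::vector<float>& avg_sq_update,
                      float rho = 0.95f, float epsilon = 1.0e-6f) {
      for (size_t i = 0; i < param.size(); ++i) {
        avg_sq_grad[i] = rho * avg_sq_grad[i] + (1 - rho) * grad[i] * grad[i];
        float update = -std::sqrt((avg_sq_update[i] + epsilon) /
                                  (avg_sq_grad[i] + epsilon)) * grad[i];
        avg_sq_update[i] = rho * avg_sq_update[i] + (1 - rho) * update * update;
        param[i] += update;
      }
    }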
......
...@@ -73,12 +73,16 @@ class AdagradOpMaker : public framework::OpProtoAndCheckerMaker {
Adaptive Gradient Algorithm (Adagrad).

The update is done as follows:

$$momentOut = moment + grad * grad \break
paramOut = param - learningRate * grad / (\sqrt{momentOut} + \epsilon)$$

The original paper (http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
does not have the epsilon attribute. It is added here in our implementation
as also proposed here: http://cs231n.github.io/neural-networks-3/#ada
for numerical stability to avoid the division by zero error.

)DOC");
}
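A corresponding sketch in the same style as the Adadelta snippet above (hypothetical helper, not the kernel):

    #include <cmath>
    #include <vector>
    // One Adagrad step over flat arrays.
    void AdagradStep(std::vector<float>& param, const std::vector<float>& grad,
                     std::vector<float>& moment, float learning_rate,
                     float epsilon = 1.0e-6f) {
      for (size_t i = 0; i < param.size(); ++i) {
        moment[i] += grad[i] * grad[i];
        param[i] -= learning_rate * grad[i] / (std::sqrt(moment[i]) + epsilon);
      }
    }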
......
...@@ -51,8 +51,8 @@ class AdamOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1,
                  "Beta1 power accumulator should have 1 dimension");
auto beta2_pow_dims = ctx->GetInputDim("Beta2Pow");
PADDLE_ENFORCE_EQ(framework::product(beta2_pow_dims), 1,
                  "Beta2 power accumulator should have 1 dimension");
auto param_dims = ctx->GetInputDim("Param");
PADDLE_ENFORCE_EQ(
...@@ -60,10 +60,10 @@ class AdamOp : public framework::OperatorWithKernel {
    "Param and Grad input of AdamOp should have same dimension");
PADDLE_ENFORCE_EQ(
    param_dims, ctx->GetInputDim("Moment1"),
    "Param and Moment1 input of AdamOp should have same dimension");
PADDLE_ENFORCE_EQ(
    param_dims, ctx->GetInputDim("Moment2"),
    "Param and Moment2 input of AdamOp should have same dimension");

ctx->SetOutputDim("ParamOut", param_dims);
ctx->SetOutputDim("Moment1Out", param_dims);
...@@ -103,23 +103,20 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
    .SetDefault(1.0e-8f);
AddComment(R"DOC(
Adam Optimizer.

This implements the Adam optimizer from Section 2 of the Adam
paper: https://arxiv.org/abs/1412.6980.
Adam is a first-order gradient-based optimization method based on
adaptive estimates of lower-order moments.

Adam updates:

$$moment_{1,out} = \beta_1 * moment_1 + (1 - \beta_1) * grad \break
moment_{2,out} = \beta_2 * moment_2 + (1 - \beta_2) * grad * grad \break
learningRate = learningRate * \sqrt{1 - \beta_{2,pow}} / (1 - \beta_{1,pow}) \break
paramOut = param - learningRate * moment_1 / (\sqrt{moment_2} + \epsilon)$$

)DOC");
}
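A flat-array reading of the four update lines (a sketch, not the operator kernel; beta1_pow and beta2_pow are the running powers tracked by the op's accumulator inputs):

    #include <cmath>
    #include <vector>
    // One Adam step; lr_t folds in the bias correction, as in the doc above.
    void AdamStep(std::vector<float>& param, const std::vector<float>& grad,
                  std::vector<float>& moment1, std::vector<float>& moment2,
                  float lr, float beta1_pow, float beta2_pow,
                  float beta1 = 0.9f, float beta2 = 0.999f,
                  float epsilon = 1.0e-8f) {
      const float lr_t = lr * std::sqrt(1 - beta2_pow) / (1 - beta1_pow);
      for (size_t i = 0; i < param.size(); ++i) {
        moment1[i] = beta1 * moment1[i] + (1 - beta1) * grad[i];
        moment2[i] = beta2 * moment2[i] + (1 - beta2) * grad[i] * grad[i];
        param[i] -= lr_t * moment1[i] / (std::sqrt(moment2[i]) + epsilon);
      }
    }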
......
...@@ -99,26 +99,22 @@ class AdamaxOpMaker : public framework::OpProtoAndCheckerMaker {
                 "Constant for numerical stability")
      .SetDefault(1.0e-8f);
  AddComment(R"DOC(
Adamax Optimizer.

We implement the Adamax optimizer from Section 7 of the Adam
paper: https://arxiv.org/abs/1412.6980. Adamax is a variant of the
Adam algorithm based on the infinity norm.

Adamax updates:

$$momentOut = \beta_1 * moment + (1 - \beta_1) * grad \break
infNormOut = \max(\beta_2 * infNorm + \epsilon, |grad|) \break
learningRate = learningRate / (1 - \beta_{1,pow}) \break
paramOut = param - learningRate * momentOut / infNormOut$$

The original paper does not have an epsilon attribute.
However, it is added here for numerical stability to prevent the
division by 0 error.

)DOC");
}
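And the corresponding infinity-norm variant, with the same caveats as the sketches above:

    #include <algorithm>
    #include <cmath>
    #include <vector>
    // One Adamax step; inf_norm is the exponentially weighted infinity norm.
    void AdamaxStep(std::vector<float>& param, const std::vector<float>& grad,
                    std::vector<float>& moment, std::vector<float>& inf_norm,
                    float lr, float beta1_pow, float beta1 = 0.9f,
                    float beta2 = 0.999f, float epsilon = 1.0e-8f) {
      const float lr_t = lr / (1 - beta1_pow);
      for (size_t i = 0; i < param.size(); ++i) {
        moment[i] = beta1 * moment[i] + (1 - beta1) * grad[i];
        inf_norm[i] = std::max(beta2 * inf_norm[i] + epsilon, std::fabs(grad[i]));
        param[i] -= lr_t * moment[i] / inf_norm[i];
      }
    }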
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <numeric>
#include "paddle/framework/lod_rank_table.h"
#include "paddle/framework/lod_tensor_array.h"
#include "paddle/framework/op_registry.h"
#include "paddle/memory/memcpy.h"
namespace paddle {
namespace operators {
using LoD = framework::LoD;
class ArrayToLoDTensorOp : public framework::OperatorBase {
public:
ArrayToLoDTensorOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
void Run(const framework::Scope &scope,
const platform::DeviceContext &dev_ctx) const override {
auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensorArray>();
auto &rank_table =
scope.FindVar(Input("RankTable"))->Get<framework::LoDRankTable>();
auto *out =
scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
// Check dims, place and data type of input's elements and infer output's
// dim
PADDLE_ENFORCE(!x.empty(), "There's no element in the input array.");
int rank = x[0].dims().size();
platform::Place place = x[0].place();
std::type_index data_type = x[0].type();
framework::DDim ins_dims = framework::slice_ddim(x[0].dims(), 1, rank);
int64_t batch_size = x[0].dims()[0];
for (size_t i = 1; i < x.size(); ++i) {
PADDLE_ENFORCE_EQ(framework::slice_ddim(x[i].dims(), 1, rank), ins_dims,
"The dimension of the %zu'th element in LoDTensorArray "
"differs from previous ones.",
i);
PADDLE_ENFORCE(platform::places_are_same_class(x[i].place(), place),
"The place class of the %zu'th element in LoDTensorArray "
"differs from previous ones.",
i);
PADDLE_ENFORCE(x[i].type() == data_type,
"The date type of the %zu'th element in LoDTensorArray "
"differs from previous ones.",
i);
batch_size += x[i].dims()[0];
}
auto ins_dim_vec = framework::vectorize(ins_dims);
ins_dim_vec.insert(ins_dim_vec.begin(), batch_size);
framework::DDim out_dims = framework::make_ddim(ins_dim_vec);
out->Resize(out_dims);
out->mutable_data(place, data_type);
auto &table_items = rank_table.items();
std::vector<size_t> table_item_idx(table_items.size());
// table_item_idx = range(table_items.size())
std::iota(table_item_idx.begin(), table_item_idx.end(), 0);
std::sort(table_item_idx.begin(), table_item_idx.end(),
[&](size_t a, size_t b) {
return table_items[a].index < table_items[b].index;
});
// Build LoDTensor `out`
framework::LoD *out_lod = out->mutable_lod();
out_lod->clear();
size_t out_offset = 0;
auto prefix_lod = rank_table.coarse_lod();
prefix_lod.emplace_back();
auto &cur_level_lod = prefix_lod.back();
cur_level_lod.push_back(0);
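    // Walk the rank-table items in ascending original-sequence order; each
    // item owns `length` time steps, and for every step both the sub-LoD and
    // the data slice of that sequence are appended to the output tensor.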
for (size_t idx : table_item_idx) {
cur_level_lod.push_back(cur_level_lod.back() + table_items[idx].length);
for (size_t x_idx = 0; x_idx < table_items[idx].length; ++x_idx) {
auto lod_and_offset = framework::GetSubLoDAndAbsoluteOffset(
x[x_idx].lod(), idx, idx + 1, 0);
auto &lod_length = lod_and_offset.first;
framework::AppendLoD(out_lod, lod_length);
size_t start_offset = lod_and_offset.second.first;
size_t end_offset = lod_and_offset.second.second;
VLOG(10) << "idx=" << idx << " x_idx=" << x_idx << " ["
<< ", " << end_offset << "]";
// Copy data
PADDLE_ENFORCE_GE(end_offset, start_offset);
size_t len = end_offset - start_offset;
if (len == 0) {
continue;
}
out->Slice(out_offset, out_offset + len)
.CopyFrom(x[x_idx].Slice(start_offset, end_offset), place, dev_ctx);
out_offset += len;
}
}
out_lod->insert(out_lod->begin(), prefix_lod.begin(), prefix_lod.end());
}
};
class ArrayToLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker {
public:
ArrayToLoDTensorOpProtoMaker(framework::OpProto *proto,
framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X",
"(std::vector<LodTensor>) A vector of tensors that is going to "
"be casted to a big LoDTensor.");
AddInput("RankTable",
"(LoDRankTable) RankTable provides the coarse lod infomation to "
"build the output LoDTensor. See "
"'paddle/framework/lod_rank_table.h' for more details.");
AddOutput("Out", "(LoDTensor) The LoDTensor formed by input tensor array.");
AddComment(
    R"DOC(This Op builds a big LoDTensor from a std::vector<LoDTensor>
and a LoDRankTable. It is supposed to be used in getting a dynamic RNN's
outputs back to a normal LoDTensor. The std::vector<LoDTensor>
would be the output of the RNN Op and the LoDRankTable would be built
with the RNN's input.)DOC");
}
};
class ArrayToLoDTensorInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *context) const override {
PADDLE_ENFORCE(context->HasInput("X"),
"ArrayToLoDTensorOp must has input X.");
PADDLE_ENFORCE(context->HasInput("RankTable"),
"ArrayToLoDTensorOp must has input RankTable.");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(array_to_lod_tensor, ops::ArrayToLoDTensorOp,
ops::ArrayToLoDTensorOpProtoMaker,
ops::ArrayToLoDTensorInferShape);
...@@ -23,11 +23,11 @@ class AucOp : public framework::OperatorWithKernel {
protected:
  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("Out"), "Input of Out should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Indices"),
                   "Input of Indices should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Label"),
                   "Input of Label should not be null.");
    auto inference_height = ctx->GetInputDim("Out")[0];
    auto label_height = ctx->GetInputDim("Label")[0];
...@@ -39,10 +39,11 @@ class AucOp : public framework::OperatorWithKernel {
  }

protected:
  framework::OpKernelType GetKernelType(
      const framework::ExecutionContext &ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<Tensor>("Out")->type()),
        ctx.device_context());
  }
};
...@@ -52,20 +53,20 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker {
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("Out",
           "A floating point 2D tensor, values are in the range [0, 1]."
           "Each row is sorted in descending order. This input should be the"
           "output of topk."
           "Typically, this tensor indicates the probability of each label");
  AddInput("Indices",
           "An int 2D tensor, indicating the indices of original"
           "tensor before sorting. Typically, this tensor indicates which "
           "label the probability stands for.");
  AddInput("Label",
           "A 2D int tensor indicating the label of the training data."
           "The height is batch size and width is always 1.");
  // TODO(typhoonzero): support weight input
  AddOutput("AUC",
            "A scalar representing the "
            "current area-under-the-curve.");

  AddAttr<std::string>("curve", "Curve type, can be 'ROC' or 'PR'.")
      .SetDefault("ROC");
...@@ -74,19 +75,18 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker {
                " roc curve.")
      .SetDefault(200);

  AddComment(R"DOC(
Area Under The Curve (AUC) Operator.

This implementation computes the AUC according to forward output and label.
It is used very widely in binary classification evaluation. As a note:
If input label contains values other than 0 and 1, it will be cast
to bool. You can find the relevant definitions here:
https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve

There are two types of possible curves:
1. ROC: Receiver operating characteristic
2. PR: Precision Recall
)DOC");
}
};
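For intuition, a minimal threshold-sweep ROC-AUC sketch over binary labels (framework-free, hypothetical helper; the operator's num_thresholds attr plays the same role here):

    #include <vector>
    // Approximates ROC AUC by sweeping num_thresholds cutoffs over scores in
    // [0, 1] and integrating TPR over FPR with the trapezoid rule.
    float RocAuc(const std::vector<float>& scores,
                 const std::vector<bool>& labels, int num_thresholds = 200) {
      std::vector<float> tpr(num_thresholds), fpr(num_thresholds);
      for (int t = 0; t < num_thresholds; ++t) {
        float cut = static_cast<float>(t) / (num_thresholds - 1);
        int tp = 0, fp = 0, pos = 0, neg = 0;
        for (size_t i = 0; i < scores.size(); ++i) {
          if (labels[i]) { ++pos; if (scores[i] >= cut) ++tp; }
          else           { ++neg; if (scores[i] >= cut) ++fp; }
        }
        tpr[t] = pos ? static_cast<float>(tp) / pos : 0.f;
        fpr[t] = neg ? static_cast<float>(fp) / neg : 0.f;
      }
      float auc = 0.f;  // thresholds ascend, so (fpr, tpr) walks toward (0, 0)
      for (int t = 1; t < num_thresholds; ++t)
        auc += (fpr[t - 1] - fpr[t]) * (tpr[t - 1] + tpr[t]) / 2;
      return auc;
    }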
......
...@@ -51,6 +51,10 @@ class BatchNormOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE(ctx->HasOutput("SavedMean"), "");
PADDLE_ENFORCE(ctx->HasOutput("SavedVariance"), "");

const float epsilon = ctx->Attrs().Get<float>("epsilon");
PADDLE_ENFORCE_GE(epsilon, 0.0, "epsilon should be non-negative");
PADDLE_ENFORCE_LE(epsilon, 0.001, "epsilon should not be too large");

// make sure Mean/MeanOut and Variance/VarianceOut share memory in Python
PADDLE_ENFORCE_EQ(ctx->Inputs("Mean")[0], ctx->Outputs("MeanOut")[0],
                  "Mean and MeanOut should share the same memory");
...@@ -66,7 +70,7 @@ class BatchNormOp : public framework::OperatorWithKernel {
                   : x_dims[x_dims.size() - 1]);
PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5,
               "Input X must have 3 to 5 dimensions.");
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], C);
...@@ -93,16 +97,16 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("X", "The input tensor");
AddInput("Scale",
         "Scale is a 1-dimensional tensor of size C "
         "that is applied to the output");
AddInput("Bias",
         "Bias is a 1-dimensional tensor of size C "
         "that is applied to the output");
AddInput("Mean",
         "The global mean (for training) or "
         "estimated mean (for testing)");
AddInput("Variance",
         "The global variance (for training) "
         "or estimated Variance (for testing)");
AddOutput("Y", "result after normalization");
AddOutput("MeanOut",
          "Share memory with Mean. "
...@@ -119,10 +123,14 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
          "will apply to output when training")
    .AsIntermediate();
AddComment(R"DOC(
Batch Normalization.

Batch Norm has been implemented as discussed in the paper:
https://arxiv.org/pdf/1502.03167.pdf
Can be used as a normalizer function for conv2d and fully_connected operations.
The required data format for this layer is one of the following:
1. NHWC `[batch, in_height, in_width, in_channels]`
2. NCHW `[batch, in_channels, in_height, in_width]`

)DOC");
}
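The layout choice only moves where the channel count C lives, as the `: x_dims[x_dims.size() - 1]` expression in the hunk above suggests. A hypothetical helper (not in this patch) that spells out the convention:

    #include <string>
    #include <vector>
    // Pick the channel count C according to the documented data format.
    int64_t ChannelCount(const std::vector<int64_t>& x_dims,
                         const std::string& layout) {
      // NCHW stores channels at index 1; NHWC stores channels last.
      return layout == "NCHW" ? x_dims[1] : x_dims.back();
    }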
...@@ -295,9 +303,9 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
  ctx->SetOutputDim(framework::GradVarName("Bias"), {C});
}

protected:
  framework::OpKernelType GetKernelType(
      const framework::ExecutionContext &ctx) const override {
    const auto *var = ctx.InputVar(framework::GradVarName("Y"));
    if (var == nullptr) {
      PADDLE_THROW("can't find Y@GRAD");
...@@ -311,7 +319,8 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
    if (t == nullptr) {
      PADDLE_THROW("can't find Y@GRAD");
    }
    return framework::OpKernelType(framework::ToDataType(t->type()),
                                   ctx.device_context());
  }
};
......
...@@ -23,13 +23,17 @@ class CastOpProtoMaker : public framework::OpProtoAndCheckerMaker {
CastOpProtoMaker(framework::OpProto *proto,
                 framework::OpAttrChecker *op_checker)
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("X", "The input tensor of cast op");
  AddOutput("Out", "The output tensor of cast op");
  AddAttr<int>("out_data_type", "output data type");
  AddAttr<int>("in_data_type", "input data type");
  AddComment(R"DOC(
Cast Operator.

This Operator casts the input tensor to another data type and
returns the Output Tensor.

)DOC");
}
};
......
...@@ -49,8 +49,11 @@ class ClipOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<AttrType>(
    "max", "(float)Maximum value, above which element is replaced by max");
AddComment(R"DOC(
Clip Operator.

The clip operator limits the value of given input within an interval. The interval is
specified with arguments 'min' and 'max'.

)DOC");
}
};
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/compare_op.h"
#include "paddle/framework/op_registry.h"
namespace paddle {
namespace operators {
template <typename OpComment>
class CompareOpProtoMaker : public framework::OpProtoAndCheckerMaker {
public:
CompareOpProtoMaker(framework::OpProto *proto,
framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
OpComment comment;
AddInput("X",
string::Sprintf("(LoDTensor) the left hand operand of %s operator",
comment.type));
AddInput("Y", string::Sprintf(
"(LoDTensor) the right hand operand of %s operator",
comment.type));
AddOutput("Out", string::Sprintf(
"(LoDTensor) n-dim bool tensor. Each element is %s",
comment.equation));
AddComment(string::Sprintf(R"DOC(%s Operator
It operates element-wise on X and Y, and returns Out. Each of them is an
N-dim tensor. X and Y could be any type. Each element of the Out tensor is
calculated by %s
)DOC",
comment.type, comment.equation));
}
};
template <typename OpComment>
class CompareOpInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *context) const override {
OpComment comment;
PADDLE_ENFORCE(context->HasInput("X"), "%s operator must has input X",
comment.type);
PADDLE_ENFORCE(context->HasInput("Y"), "%s operator must has input Y",
comment.type);
auto dim_x = context->GetInputDim("X");
auto dim_y = context->GetInputDim("Y");
PADDLE_ENFORCE_EQ(framework::product(dim_x), framework::product(dim_y),
"The number of elements in X and Y should be same");
context->SetOutputDim("Out", context->GetInputDim("X"));
context->ShareLoD("X", "Out");
}
};
} // namespace operators
} // namespace paddle
#define REGISTER_LOGICAL_OP(op_type, _equation) \
struct _##op_type##Comment { \
static char type[]; \
static char equation[]; \
}; \
char _##op_type##Comment::type[]{#op_type}; \
char _##op_type##Comment::equation[]{_equation}; \
REGISTER_OP_WITH_KERNEL( \
op_type, ::paddle::operators::CompareOpProtoMaker<_##op_type##Comment>, \
::paddle::operators::CompareOpInferShape<_##op_type##Comment>, \
::paddle::framework::EmptyGradOpMaker);
REGISTER_LOGICAL_OP(less_than, "Out = X < Y");
REGISTER_LOGICAL_KERNEL(less_than, CPU, paddle::operators::LessThanFunctor);
REGISTER_LOGICAL_OP(equal, "Out = X == Y");
REGISTER_LOGICAL_KERNEL(equal, CPU, paddle::operators::EqualFunctor);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/compare_op.h"
REGISTER_LOGICAL_KERNEL(less_than, GPU, paddle::operators::LessThanFunctor);
REGISTER_LOGICAL_KERNEL(equal, GPU, paddle::operators::EqualFunctor);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <math.h>
#include <type_traits>
#include "paddle/framework/op_registry.h"
#include "paddle/platform/transform.h"
namespace paddle {
namespace operators {
template <typename T>
struct LessThanFunctor {
using ELEM_TYPE = T;
HOSTDEVICE bool operator()(const T& a, const T& b) const { return a < b; }
};
template <typename T>
struct EqualFunctor {
using ELEM_TYPE = T;
HOSTDEVICE bool operator()(const T& a, const T& b) const {
if (std::is_floating_point<T>::value) {
// This branch is optimized away at compile time when T is an integer
// type. It is safe to cast a and b to double.
return fabs(static_cast<double>(a - b)) < 1e-8;
} else {
return (a == b);
}
}
};
template <typename Place, typename Functor>
class CompareOpKernel
: public framework::OpKernel<typename Functor::ELEM_TYPE> {
public:
void Compute(const framework::ExecutionContext& context) const override {
using T = typename Functor::ELEM_TYPE;
auto* x = context.Input<framework::Tensor>("X");
auto* y = context.Input<framework::Tensor>("Y");
auto* out = context.Output<framework::Tensor>("Out");
Functor binary_func;
platform::Transform<Place> trans;
trans(context.device_context(), x->data<T>(), x->data<T>() + x->numel(),
y->data<T>(), out->mutable_data<bool>(context.GetPlace()),
binary_func);
}
};
} // namespace operators
} // namespace paddle
#define REGISTER_LOGICAL_KERNEL(op_type, dev, functor) \
REGISTER_OP_##dev##_KERNEL( \
op_type, \
::paddle::operators::CompareOpKernel<::paddle::platform::dev##Place, \
functor<int>>, \
::paddle::operators::CompareOpKernel<::paddle::platform::dev##Place, \
functor<int64_t>>, \
::paddle::operators::CompareOpKernel<::paddle::platform::dev##Place, \
functor<float>>, \
::paddle::operators::CompareOpKernel<::paddle::platform::dev##Place, \
functor<double>>);
...@@ -56,20 +56,24 @@ class ConcatOpMaker : public framework::OpProtoAndCheckerMaker {
public:
  ConcatOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input tensors of concat operator.").AsDuplicable();
    AddOutput("Out", "Output tensor of concat operator.");
    AddAttr<int>("axis",
                 "The axis along which the input tensors will be concatenated.")
        .SetDefault(0);
    AddComment(R"DOC(
Concat Operator.

Concatenate the input tensors along dimension axis.
Examples:
  Input[0] = [[1,2],[3,4]]
  Input[1] = [[5,6]]
  axis = 0
  Output = [[1,2],
            [3,4],
            [5,6]]

)DOC");
  }
};
......
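
The axis attribute above implies a simple shape rule: every input must match on
all dimensions except axis, and the output extent along axis is the sum of the
input extents. A standalone sketch (illustrative only; ConcatShape is a
hypothetical helper, not part of the operator):

#include <cassert>
#include <cstdio>
#include <vector>

std::vector<int> ConcatShape(const std::vector<std::vector<int>>& ins,
                             size_t axis) {
  std::vector<int> out = ins[0];
  for (size_t i = 1; i < ins.size(); ++i) {
    assert(ins[i].size() == out.size());  // same rank required
    for (size_t d = 0; d < out.size(); ++d) {
      if (d == axis) {
        out[d] += ins[i][d];  // extents add up along the concat axis
      } else {
        assert(ins[i][d] == out[d]);  // all other dims must agree
      }
    }
  }
  return out;
}

int main() {
  // Matches the DOC example: a 2x2 and a 1x2 input, axis = 0 -> 3x2 output.
  std::vector<int> out = ConcatShape({{2, 2}, {1, 2}}, 0);
  std::printf("%d x %d\n", out[0], out[1]);  // prints "3 x 2"
  return 0;
}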
@@ -216,11 +216,12 @@ class CondOpProtoAndCheckerMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("IndexTensors", "Index Tensors contains indices for true/false");
 
     AddComment(R"DOC(
-Sample dependent Cond Operator:
-Given Cond[i] as a 1/0 vector to indicate true/false
-The equation is:
-Out[i] = subnet_t[i], if Cond[i] == true
-Out[i] = subnet_t[i], if Cond[i] == false
+Sample Dependent Conditional Operator.
+
+Given Cond[i] as a 1/0 vector to indicate true/false:
+Out[i] = subnet_true[i], if Cond[i] == true
+Out[i] = subnet_false[i], if Cond[i] == false
 )DOC");
   }
 };
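
The DOC equation selects, per index, between the outputs of the two branches.
A minimal sketch of just that selection (illustrative; the real CondOp gathers
the true and false rows into sub-scopes via the index tensors, runs each
subnet only on its subset, and scatters the results back):

#include <cstdio>
#include <vector>

int main() {
  std::vector<int> cond = {1, 0, 1};
  std::vector<float> subnet_true = {10.f, 20.f, 30.f};
  std::vector<float> subnet_false = {-1.f, -2.f, -3.f};
  std::vector<float> out(cond.size());
  for (size_t i = 0; i < cond.size(); ++i) {
    // Out[i] = subnet_true[i] if Cond[i] == true, else subnet_false[i].
    out[i] = cond[i] ? subnet_true[i] : subnet_false[i];
  }
  for (float v : out) std::printf("%g ", v);  // prints "10 -2 30"
  std::printf("\n");
  return 0;
}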
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/conv_transpose_op.h"
namespace paddle {
namespace operators {
class CudnnConv2DTransposeOpMaker : public Conv2DTransposeOpMaker {
public:
CudnnConv2DTransposeOpMaker(framework::OpProto* proto,
framework::OpAttrChecker* op_checker)
: Conv2DTransposeOpMaker(proto, op_checker) {
AddAttr<std::vector<int>>("dilations", "dilations of convolution operator.")
.SetDefault(std::vector<int>{1, 1});
AddAttr<int>("workspace_size_MB",
"workspace size for cudnn, in MB, "
"workspace is a section of GPU memory which will be "
"allocated/freed each time the operator runs, larger "
"workspace size can increase performance but also requires "
"better hardward. This size should be carefully setted.")
.SetDefault(4096);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(conv2d_transpose_cudnn, ops::ConvTransposeOp,
ops::CudnnConv2DTransposeOpMaker, conv2d_transpose_cudnn_grad,
ops::ConvTransposeOpGrad);
REGISTER_OP_CPU_KERNEL(
conv2d_transpose_cudnn,
ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(
conv2d_transpose_cudnn_grad,
ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
#include "paddle/memory/memory.h"
#include "paddle/operators/conv_transpose_op.h"
#include "paddle/platform/assert.h"
#include "paddle/platform/cudnn_helper.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
using ScopedFilterDescriptor = platform::ScopedFilterDescriptor;
using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor;
using DataLayout = platform::DataLayout;
static constexpr size_t kConvCudnnWorkspaceLimitBytes = 1024 * 1024 * 1024;
template <typename T>
class CudnnConvTransposeOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use GPUPlace.");
auto* input = ctx.Input<Tensor>("Input");
auto* filter = ctx.Input<Tensor>("Filter");
auto* output = ctx.Output<Tensor>("Output");
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
// cudnn v5 does not support dilations
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
int user_workspace_size = ctx.Attr<int>("workspace_size_MB");
const T* input_data = input->data<T>();
const T* filter_data = filter->data<T>();
T* output_data = output->mutable_data<T>(ctx.GetPlace());
// ------------------- cudnn descriptors ---------------------
ScopedTensorDescriptor input_desc;
ScopedTensorDescriptor output_desc;
ScopedFilterDescriptor filter_desc;
ScopedConvolutionDescriptor conv_desc;
DataLayout layout = DataLayout::kNCHW;
// N, M, H, W
cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
layout, framework::vectorize2int(input->dims()));
// N, C, O_h, O_w
cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
layout, framework::vectorize2int(output->dims()));
// M, C, K_h, K_w
cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
layout, framework::vectorize2int(filter->dims()));
cudnnConvolutionDescriptor_t cudnn_conv_desc =
conv_desc.descriptor<T>(paddings, strides, dilations);
// ------------------- cudnn conv workspace ---------------------
void* cudnn_workspace = nullptr;
size_t workspace_size_in_bytes; // final workspace to allocate.
size_t workspace_size_limit = kConvCudnnWorkspaceLimitBytes;
if (user_workspace_size > 0) {
workspace_size_limit = user_workspace_size * 1024 * 1024;
}
// ------------------- cudnn conv algorithm ---------------------
cudnnConvolutionBwdDataAlgo_t algo;
auto handle = ctx.cuda_device_context().cudnn_handle();
// Get the algorithm
PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc,
// dxDesc: Handle to the previously initialized output tensor
// descriptor.
cudnn_output_desc, CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &algo));
// Get the amount of workspace the chosen algorithm needs.
PADDLE_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc,
cudnn_output_desc, algo, &workspace_size_in_bytes));
// Allocate on GPU memory
platform::GPUPlace gpu = boost::get<platform::GPUPlace>(ctx.GetPlace());
cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
// ------------------- cudnn conv transpose forward ---------------------
T alpha = 1.0f, beta = 0.0f;
PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
handle, &alpha, cudnn_filter_desc, filter_data, cudnn_input_desc,
input_data, cudnn_conv_desc, algo, cudnn_workspace,
workspace_size_in_bytes, &beta, cudnn_output_desc, output_data));
// Release the cudnn workspace
paddle::memory::Free(gpu, cudnn_workspace);
}
};
template <typename T>
class CudnnConvTransposeGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use GPUPlace.");
auto input = ctx.Input<Tensor>("Input");
auto filter = ctx.Input<Tensor>("Filter");
auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
auto input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
auto filter_grad = ctx.Output<Tensor>(framework::GradVarName("Filter"));
const T* input_data = input->data<T>();
const T* output_grad_data = output_grad->data<T>();
const T* filter_data = filter->data<T>();
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
// cudnn v5 does not support dilations
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
int user_workspace_size = ctx.Attr<int>("workspace_size_MB");
// ------------------- cudnn descriptors ---------------------
ScopedTensorDescriptor input_desc;
ScopedTensorDescriptor output_desc;
ScopedFilterDescriptor filter_desc;
ScopedConvolutionDescriptor conv_desc;
DataLayout layout = DataLayout::kNCHW;
// Input: (N, M, H, W)
cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
layout, framework::vectorize2int(input->dims()));
// Output: (N, C, O_H, O_W)
cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
layout, framework::vectorize2int(output_grad->dims()));
// Filter (M, C, K_H, K_W)
cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
layout, framework::vectorize2int(filter->dims()));
cudnnConvolutionDescriptor_t cudnn_conv_desc =
conv_desc.descriptor<T>(paddings, strides, dilations);
// ------------------- cudnn backward algorithm ---------------------
cudnnConvolutionFwdAlgo_t data_algo;
cudnnConvolutionBwdFilterAlgo_t filter_algo;
size_t bwd_filter_ws_size, fwd_ws_size;
size_t workspace_size_in_bytes = 0;
size_t workspace_size_limit = kConvCudnnWorkspaceLimitBytes;
if (user_workspace_size > 0) {
workspace_size_limit = user_workspace_size * 1024 * 1024;
}
auto handle = ctx.cuda_device_context().cudnn_handle();
if (input_grad) {
// choose backward algorithm for data
PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc,
cudnn_input_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &data_algo));
PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc,
cudnn_input_desc, data_algo, &fwd_ws_size));
workspace_size_in_bytes = std::max(workspace_size_in_bytes, fwd_ws_size);
}
if (filter_grad) {
// choose backward algorithm for filter
PADDLE_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
handle, cudnn_output_desc, cudnn_input_desc, cudnn_conv_desc,
cudnn_filter_desc,
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &filter_algo));
// get workspace for backwards filter algorithm
PADDLE_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
handle, cudnn_output_desc, cudnn_input_desc, cudnn_conv_desc,
cudnn_filter_desc, filter_algo, &bwd_filter_ws_size));
workspace_size_in_bytes =
std::max(workspace_size_in_bytes, bwd_filter_ws_size);
}
// ------------------- cudnn conv workspace ---------------------
// Already on GPU
void* cudnn_workspace = nullptr;
platform::GPUPlace gpu = boost::get<platform::GPUPlace>(ctx.GetPlace());
cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
// ------------------- cudnn conv backward data ---------------------
// FIXME(typhoonzero): template type T may not be the same as cudnn call.
T alpha = 1.0f, beta = 0.0f;
if (input_grad) {
T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
auto t = framework::EigenVector<T>::Flatten(*input_grad);
t.device(ctx.GetEigenDevice<platform::GPUPlace>()) =
t.constant(static_cast<T>(0));
PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward(
handle, &alpha, cudnn_output_desc, output_grad_data,
cudnn_filter_desc, filter_data, cudnn_conv_desc, data_algo,
cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc,
input_grad_data));
}
// ------------------- cudnn conv backward filter ---------------------
if (filter_grad) {
T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
auto t = framework::EigenVector<T>::Flatten(*filter_grad);
t.device(ctx.GetEigenDevice<platform::GPUPlace>()) =
t.constant(static_cast<T>(0));
// Gradient with respect to the filter
PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
handle, &alpha, cudnn_output_desc, output_grad_data, cudnn_input_desc,
input_data, cudnn_conv_desc, filter_algo, cudnn_workspace,
workspace_size_in_bytes, &beta, cudnn_filter_desc, filter_grad_data));
}
// Release the cudnn workspace
paddle::memory::Free(gpu, cudnn_workspace);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(conv2d_transpose_cudnn,
ops::CudnnConvTransposeOpKernel<float>);
REGISTER_OP_GPU_KERNEL(conv2d_transpose_cudnn_grad,
ops::CudnnConvTransposeGradOpKernel<float>);
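
The forward kernel above hands the transposed convolution to
cudnnConvolutionBackwardData because the two computations coincide: a
transposed convolution scatters every input element through the filter, which
is exactly how data gradients are back-propagated through a strided
convolution. A naive 1-D, single-channel, zero-padding CPU sketch
(illustrative only; ConvTranspose1D is a hypothetical helper):

#include <cstdio>
#include <vector>

std::vector<float> ConvTranspose1D(const std::vector<float>& in,
                                   const std::vector<float>& filter,
                                   int stride) {
  // O = (I - 1) * stride + K, the same rule ConvTransposeOp::InferShape uses.
  int out_size = (static_cast<int>(in.size()) - 1) * stride +
                 static_cast<int>(filter.size());
  std::vector<float> out(out_size, 0.f);
  for (size_t i = 0; i < in.size(); ++i) {
    for (size_t k = 0; k < filter.size(); ++k) {
      out[i * stride + k] += in[i] * filter[k];  // scatter-add through filter
    }
  }
  return out;
}

int main() {
  std::vector<float> out = ConvTranspose1D({1.f, 2.f}, {1.f, 1.f, 1.f}, 2);
  for (float v : out) std::printf("%g ", v);  // prints "1 1 3 2 2"
  std::printf("\n");
  return 0;
}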
@@ -12,7 +12,7 @@
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/conv2d_op.h"
+#include "paddle/operators/conv_op.h"
 
 namespace paddle {
 namespace operators {
@@ -29,7 +29,7 @@ class CudnnConvOpMaker : public Conv2DOpMaker {
                  "workspace is a section of GPU memory which will be "
                  "allocated/freed each time the operator runs, larger "
                  "workspace size can increase performance but also requires "
-                 "better hardward. This size should be carefully setted.")
+                 "better hardware. This size should be chosen carefully.")
         .SetDefault(4096);
   }
 };
@@ -38,10 +38,11 @@ class CudnnConvOpMaker : public Conv2DOpMaker {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(conv_cudnn, ops::Conv2DOp, ops::CudnnConvOpMaker, conv_cudnn_grad,
-            ops::Conv2DOpGrad);
+REGISTER_OP(conv_cudnn, ops::ConvOp, ops::CudnnConvOpMaker, conv_cudnn_grad,
+            ops::ConvOpGrad);
+
+REGISTER_OP_CPU_KERNEL(conv_cudnn,
+                       ops::GemmConvKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(
-    conv_cudnn, ops::GemmConv2DKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_CPU_KERNEL(
     conv_cudnn_grad,
-    ops::GemmConvGrad2DKernel<paddle::platform::CPUPlace, float>);
+    ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>);
@@ -15,7 +15,7 @@
 #include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/memory/memory.h"
-#include "paddle/operators/conv2d_op.h"
+#include "paddle/operators/conv_op.h"
 #include "paddle/platform/assert.h"
 #include "paddle/platform/cudnn_helper.h"
 
@@ -27,7 +27,6 @@ using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
 using ScopedFilterDescriptor = platform::ScopedFilterDescriptor;
 using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor;
 using DataLayout = platform::DataLayout;
-using CUDADeviceContext = platform::CUDADeviceContext;
 
 static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES = 1024 * 1024 * 1024;
@@ -12,18 +12,18 @@
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/conv2d_op.h"
+#include "paddle/operators/conv_op.h"
 
 namespace paddle {
 namespace operators {
 
-void Conv2DOp::InferShape(framework::InferShapeContext* ctx) const {
+void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
   PADDLE_ENFORCE(ctx->HasInput("Input"),
-                 "Input(Input) of Conv2DOp should not be null.");
+                 "Input(Input) of ConvOp should not be null.");
   PADDLE_ENFORCE(ctx->HasInput("Filter"),
-                 "Input(Filter) of Conv2DOp should not be null.");
+                 "Input(Filter) of ConvOp should not be null.");
   PADDLE_ENFORCE(ctx->HasOutput("Output"),
-                 "Output(Output) of Conv2DOp should not be null.");
+                 "Output(Output) of ConvOp should not be null.");
 
   auto in_dims = ctx->GetInputDim("Input");
   auto filter_dims = ctx->GetInputDim("Filter");
@@ -33,8 +33,17 @@ void Conv2DOp::InferShape(framework::InferShapeContext* ctx) const {
   int input_channels = in_dims[1];
   int output_channels = filter_dims[0];
 
-  PADDLE_ENFORCE_EQ(in_dims.size(), 4, "Conv2DOp input should be 4-D.");
-  PADDLE_ENFORCE_EQ(filter_dims.size(), 4, "Conv2DOp filter should be 4-D.");
+  PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5,
+                 "Conv input should be 4-D or 5-D tensor.");
+  PADDLE_ENFORCE_EQ(
+      in_dims.size(), filter_dims.size(),
+      "Conv input dimension and filter dimension should be the same.");
+  PADDLE_ENFORCE(
+      in_dims.size() - strides.size() == 2U,
+      "Conv input dimension and strides dimension should be consistent.");
+  PADDLE_ENFORCE_EQ(
+      paddings.size(), strides.size(),
+      "Conv paddings dimension and Conv strides dimension should be the same.");
   PADDLE_ENFORCE_EQ(input_channels, filter_dims[1] * groups,
                     "The number of input channels should be equal to filter "
                     "channels * groups.");
@@ -42,12 +51,12 @@ void Conv2DOp::InferShape(framework::InferShapeContext* ctx) const {
       output_channels % groups, 0,
       "The number of output channels should be divided by groups.");
 
-  auto output_height =
-      OutputSize(in_dims[2], filter_dims[2], paddings[0], strides[0]);
-  auto output_width =
-      OutputSize(in_dims[3], filter_dims[3], paddings[1], strides[1]);
-  ctx->SetOutputDim("Output",
-                    {in_dims[0], filter_dims[0], output_height, output_width});
+  std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
+  for (size_t i = 0; i < paddings.size(); ++i) {
+    output_shape.push_back(OutputSize(in_dims[i + 2], filter_dims[i + 2],
+                                      paddings[i], strides[i]));
+  }
+  ctx->SetOutputDim("Output", framework::make_ddim(output_shape));
 }
 
 Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto,
@@ -55,18 +64,19 @@ Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto,
     : OpProtoAndCheckerMaker(proto, op_checker) {
   AddInput(
       "Input",
-      "The input tensor of convolution operator. "
-      "The format of input tensor is NCHW. Where N is batch size, C is the "
-      "number of channels, H and W is the height and width of image.");
+      "(Tensor) The input tensor of convolution operator. "
+      "The format of input tensor is NCHW, where N is batch size, C is the "
+      "number of channels, H is the height of the feature, "
+      "and W is the width of the feature.");
   AddInput("Filter",
-           "The filter tensor of convolution operator."
+           "(Tensor) The filter tensor of convolution operator. "
            "The format of the filter tensor is MCHW, where M is the number of "
            "output image channels, C is the number of input image channels, "
-           "H and W is height and width of filter. "
-           "If the groups attribute is greater than 1, C equal the number of "
+           "H is the height of the filter, and W is the width of the filter. "
+           "If the groups attribute is greater than 1, C equals the number of "
            "input image channels divided by the groups.");
   AddOutput("Output",
-            "The output tensor of convolution operator."
+            "(Tensor) The output tensor of convolution operator. "
             "The format of output tensor is also NCHW.");
   AddAttr<std::vector<int>>("strides", "strides of convolution operator.")
       .SetDefault({1, 1});
@@ -74,20 +84,100 @@ Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto,
       .SetDefault({0, 0});
   AddAttr<int>(
       "groups",
-      "group size of convolution operator. "
-      "Refer to grouped convolution in Alex Krizhevsky's paper: "
-      "when group=2, the first half of the filters are only connected to the "
-      "first half of the input channels, and the second half only connected "
-      "to the second half.")
+      "(int default:1), the group size of convolution operator. "
+      "According to grouped convolution in Alex Krizhevsky's Deep CNN paper: "
+      "when group=2, the first half of the filters is only connected to the "
+      "first half of the input channels, while the second half of the filters "
+      "is only connected to the second half of the input channels.")
+      .SetDefault(1);
+  AddComment(R"DOC(
+Convolution Operator.
+
+The convolution operation calculates the output based on the input, filter
+and strides, paddings, groups parameters. The size of each dimension of the
+parameters is checked in the infer-shape.
+Input(Input, Filter) and output(Output) are in NCHW format, where N is batch
+size, C is the number of channels, H is the height of the feature, and W is
+the width of the feature. Parameters (ksize, strides, paddings) consist of
+two elements. These two elements represent height and width, respectively.
+The input(X) size and output(Out) size may be different.
+Example:
+  Input:
+       Input shape: (N, C_in, H_in, W_in)
+       Filter shape: (C_out, C_in, H_f, W_f)
+  Output:
+       Output shape: (N, C_out, H_out, W_out)
+  where
+       H_out = (H_in - filter_size[0] + 2 * paddings[0]) / strides[0] + 1;
+       W_out = (W_in - filter_size[1] + 2 * paddings[1]) / strides[1] + 1;
+)DOC");
+}
+
+Conv3DOpMaker::Conv3DOpMaker(framework::OpProto* proto,
+                             framework::OpAttrChecker* op_checker)
+    : OpProtoAndCheckerMaker(proto, op_checker) {
+  AddInput(
+      "Input",
+      "(Tensor) The input tensor of convolution operator. "
+      "The format of input tensor is NCDHW, where N is batch size, C is the "
+      "number of channels, D is the depth of the feature, H is the height of "
+      "the feature, and W is the width of the feature.");
+  AddInput("Filter",
+           "(Tensor) The filter tensor of convolution operator. "
+           "The format of the filter tensor is MCDHW, where M is the number of "
+           "output image channels, C is the number of input image channels, "
+           "D is the depth of the filter, H is the height of the filter, and W "
+           "is the width of the filter. "
+           "If the groups attribute is greater than 1, C equals the number of "
+           "input image channels divided by the groups.");
+  AddOutput("Output",
+            "(Tensor) The output tensor of convolution operator. "
+            "The format of output tensor is also NCDHW.");
+  AddAttr<std::vector<int>>(
+      "strides",
+      "(vector, default:{1, 1, 1}), the strides of convolution operator.")
+      .SetDefault({1, 1, 1});
+  AddAttr<std::vector<int>>(
+      "paddings",
+      "(vector, default:{0, 0, 0}), the paddings of convolution operator.")
+      .SetDefault({0, 0, 0});
+  AddAttr<int>(
+      "groups",
+      "(int default:1), the group size of convolution operator. "
+      "According to grouped convolution in Alex Krizhevsky's Deep CNN paper: "
+      "when group=2, the first half of the filters is only connected to the "
+      "first half of the input channels, while the second half of the filters "
+      "is only connected to the second half of the input channels.")
       .SetDefault(1);
   AddComment(R"DOC(
+Convolution3D Operator.
+
 The convolution operation calculates the output based on the input, filter
 and strides, paddings, groups parameters. The size of each dimension of the
 parameters is checked in the infer-shape.
+Input(Input, Filter) and output(Output) are in NCDHW format, where N is batch
+size, C is the number of channels, D is the depth of the feature, H is the
+height of the feature, and W is the width of the feature.
+Parameters (ksize, strides, paddings) consist of three elements.
+These three elements represent depth, height and width, respectively.
+The input(X) size and output(Out) size may be different.
+Example:
+  Input:
+       Input shape: (N, C_in, D_in, H_in, W_in)
+       Filter shape: (C_out, C_in, D_f, H_f, W_f)
+  Output:
+       Output shape: (N, C_out, D_out, H_out, W_out)
+  where
+       D_out = (D_in - filter_size[0] + 2 * paddings[0]) / strides[0] + 1;
+       H_out = (H_in - filter_size[1] + 2 * paddings[1]) / strides[1] + 1;
+       W_out = (W_in - filter_size[2] + 2 * paddings[2]) / strides[2] + 1;
 )DOC");
 }
 
-void Conv2DOpGrad::InferShape(framework::InferShapeContext* ctx) const {
+void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const {
   auto in_dims = ctx->GetInputDim("Input");
   auto filter_dims = ctx->GetInputDim("Filter");
   if (ctx->HasOutput(framework::GradVarName("Input"))) {
@@ -102,10 +192,18 @@ void Conv2DOpGrad::InferShape(framework::InferShapeContext* ctx) const {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(conv2d, ops::Conv2DOp, ops::Conv2DOpMaker, conv2d_grad,
-            ops::Conv2DOpGrad);
+REGISTER_OP(conv2d, ops::ConvOp, ops::Conv2DOpMaker, conv2d_grad,
+            ops::ConvOpGrad);
+
+namespace ops = paddle::operators;
+REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad,
+            ops::ConvOpGrad);
+
+REGISTER_OP_CPU_KERNEL(conv2d,
+                       ops::GemmConvKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(
-    conv2d, ops::GemmConv2DKernel<paddle::platform::CPUPlace, float>);
+    conv2d_grad, ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(conv3d,
+                       ops::GemmConvKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(
-    conv2d_grad, ops::GemmConvGrad2DKernel<paddle::platform::CPUPlace, float>);
+    conv3d_grad, ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>);
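
A standalone sketch of the OutputSize rule that ConvOp::InferShape applies per
spatial dimension, with one worked example (assuming the usual definition that
matches the DOC formula; illustrative only):

#include <cstdio>

int OutputSize(int input_size, int filter_size, int padding, int stride) {
  // H_out = (H_in - filter_size + 2 * padding) / stride + 1
  return (input_size - filter_size + 2 * padding) / stride + 1;
}

int main() {
  // A 224x224 feature map with a 7x7 filter, padding 3, stride 2 -> 112x112.
  std::printf("%d\n", OutputSize(224, 7, 3, 2));  // prints 112
  return 0;
}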
@@ -12,11 +12,16 @@
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/conv2d_op.h"
+#include "paddle/operators/conv_op.h"
 
 namespace ops = paddle::operators;
 
+REGISTER_OP_GPU_KERNEL(conv2d,
+                       ops::GemmConvKernel<paddle::platform::GPUPlace, float>);
 REGISTER_OP_GPU_KERNEL(
-    conv2d, ops::GemmConv2DKernel<paddle::platform::GPUPlace, float>);
+    conv2d_grad, ops::GemmConvGradKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(conv3d,
+                       ops::GemmConvKernel<paddle::platform::GPUPlace, float>);
 REGISTER_OP_GPU_KERNEL(
-    conv2d_grad, ops::GemmConvGrad2DKernel<paddle::platform::GPUPlace, float>);
+    conv3d_grad, ops::GemmConvGradKernel<paddle::platform::GPUPlace, float>);
@@ -18,6 +18,7 @@ limitations under the License. */
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/math/im2col.h"
 #include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/vol2col.h"
 
 namespace paddle {
 namespace operators {
@@ -40,14 +41,20 @@ class Conv2DOpMaker : public framework::OpProtoAndCheckerMaker {
               framework::OpAttrChecker* op_checker);
 };
 
-class Conv2DOp : public framework::OperatorWithKernel {
+class Conv3DOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  Conv3DOpMaker(framework::OpProto* proto,
+                framework::OpAttrChecker* op_checker);
+};
+
+class ConvOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
   void InferShape(framework::InferShapeContext* ctx) const override;
 };
 
-class Conv2DOpGrad : public framework::OperatorWithKernel {
+class ConvOpGrad : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -55,7 +62,7 @@ class Conv2DOpGrad : public framework::OperatorWithKernel {
 };
 
 template <typename Place, typename T>
-class GemmConv2DKernel : public framework::OpKernel<T> {
+class GemmConvKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     const Tensor* input = context.Input<Tensor>("Input");
@@ -70,51 +77,78 @@ class GemmConv2DKernel : public framework::OpKernel<T> {
     std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
     int groups = context.Attr<int>("groups");
 
-    int batch_size = input->dims()[0];
-    int input_channels = input->dims()[1];
-    int filter_height = filter.dims()[filter.dims().size() - 2];
-    int filter_width = filter.dims()[filter.dims().size() - 1];
-    int output_channels = output->dims()[1];
-    int output_height = output->dims()[2];
-    int output_width = output->dims()[3];
+    const int batch_size = static_cast<int>(input->dims()[0]);
 
-    paddle::operators::math::Im2ColFunctor<
-        paddle::operators::math::ColFormat::kCFO, Place, T>
-        im2col;
+    // filter_shape_vec: {k_h, k_w} or {k_d, k_h, k_w}
+    std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
+    filter_shape_vec.erase(filter_shape_vec.begin(),
+                           filter_shape_vec.begin() + 2);
+
+    // output_shape_vec: {o_h, o_w} or {o_d, o_h, o_w}
+    std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
+    output_shape_vec.erase(output_shape_vec.begin(),
+                           output_shape_vec.begin() + 2);
+
     // use col_shape in the im2col calculation
-    framework::DDim col_shape = {input_channels / groups, filter_height,
-                                 filter_width, output_height, output_width};
+    // col_shape_vec: {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, o_d,
+    // o_h, o_w}
+    std::vector<int64_t> col_shape_vec;
+    col_shape_vec.push_back(input->dims()[1] / groups);
+    col_shape_vec.insert(col_shape_vec.end(), filter_shape_vec.begin(),
+                         filter_shape_vec.end());
+    col_shape_vec.insert(col_shape_vec.end(), output_shape_vec.begin(),
+                         output_shape_vec.end());
+    framework::DDim col_shape(framework::make_ddim(col_shape_vec));
+
     // use col_matrix_shape in the gemm calculation
-    framework::DDim col_matrix_shape = {
-        input_channels / groups * filter_height * filter_width,
-        output_height * output_width};
+    // size: (i_c/g * k_h * k_w, o_h * o_w) or (i_c/g * k_d * k_h * k_w, o_d *
+    // o_h * o_w)
+    framework::DDim col_matrix_shape =
+        framework::flatten_to_2d(col_shape, filter_shape_vec.size() + 1);
+
     Tensor col;
     col.mutable_data<T>(col_shape, context.GetPlace());
     // col_matrix shares the same piece of data with col,
     // but will be reshaped into a two-dimensional matrix shape
     // to call the matrix multiplication interface.
-    Tensor col_matrix = col;
+    Tensor col_matrix;
+    col_matrix.ShareDataWith(col);
    col_matrix.Resize(col_matrix_shape);
 
-    framework::DDim input_shape = {input->dims()[1], input->dims()[2],
-                                   input->dims()[3]};
+    framework::DDim input_shape = framework::slice_ddim(
+        input->dims(), 1, static_cast<int>(input->dims().size()));
+
     framework::DDim filter_matrix_shape = {filter.dims()[0],
                                            filter.numel() / filter.dims()[0]};
     filter.Resize(filter_matrix_shape);
 
-    framework::DDim output_matrix_shape = {output_channels,
-                                           output_height * output_width};
-    // convolution operator: im2col + gemm
-    int in_step = input_channels / groups;
-    int out_step = output_channels / groups;
+    framework::DDim output_matrix_shape = {
+        output->dims()[1],
+        output->numel() / (output->dims()[0] * output->dims()[1])};
+
+    // convolution operator: im2col(or vol2col) + gemm
+    int in_step = static_cast<int>(input->dims()[1]) / groups;
+    int out_step = static_cast<int>(output->dims()[1]) / groups;
+
     for (int i = 0; i < batch_size; i++) {
       Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
       Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
+
       for (int g = 0; g < groups; g++) {
-        // im2col
         Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
-        im2col(context.device_context(), in_slice, col, strides[0], strides[1],
-               paddings[0], paddings[0], paddings[1], paddings[1]);
+
+        if (filter_shape_vec.size() == 2) {
+          // im2col
+          math::Im2ColFunctor<math::ColFormat::kCFO, Place, T> im2col;
+          im2col(context.device_context(), in_slice, col, strides[0],
+                 strides[1], paddings[0], paddings[0], paddings[1],
+                 paddings[1]);
+        } else if (filter_shape_vec.size() == 3) {
+          // vol2col
+          math::Vol2ColFunctor<Place, T> vol2col;
+          vol2col(context.device_context(), in_slice, col, strides[0],
+                  strides[1], strides[2], paddings[0], paddings[1],
+                  paddings[2]);
+        }
 
         // gemm
         Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
@@ -127,7 +161,7 @@ class GemmConv2DKernel : public framework::OpKernel<T> {
 };
 
 template <typename Place, typename T>
-class GemmConvGrad2DKernel : public framework::OpKernel<T> {
+class GemmConvGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     const Tensor* input = context.Input<Tensor>("Input");
@@ -137,64 +171,79 @@ class GemmConvGrad2DKernel : public framework::OpKernel<T> {
         context.Output<Tensor>(framework::GradVarName("Input"));
     Tensor* filter_grad =
         context.Output<Tensor>(framework::GradVarName("Filter"));
     // The filter and filter_grad will be reshaped in the calculations,
     // so here use an assignment operation,
     // that avoids modifying the variable in the Scope.
     Tensor filter = *context.Input<Tensor>("Filter");
 
+    if (!input_grad && !filter_grad) return;
+
     std::vector<int> strides = context.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
     int groups = context.Attr<int>("groups");
 
-    int batch_size = input->dims()[0];
-    int input_channels = input->dims()[1];
-    int filter_height = filter.dims()[filter.dims().size() - 2];
-    int filter_width = filter.dims()[filter.dims().size() - 1];
-    int output_channels = output_grad->dims()[1];
-    int output_height = output_grad->dims()[2];
-    int output_width = output_grad->dims()[3];
+    const int batch_size = static_cast<int>(input->dims()[0]);
 
-    paddle::operators::math::Col2ImFunctor<
-        paddle::operators::math::ColFormat::kCFO, Place, T>
-        col2im;
-    paddle::operators::math::Im2ColFunctor<
-        paddle::operators::math::ColFormat::kCFO, Place, T>
-        im2col;
-    // use col_shape in the im2col and col2im calculation
-    framework::DDim col_shape = {input_channels / groups, filter_height,
-                                 filter_width, output_height, output_width};
+    // filter_shape_vec: {k_h, k_w} or {k_d, k_h, k_w}
+    std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
+    filter_shape_vec.erase(filter_shape_vec.begin(),
+                           filter_shape_vec.begin() + 2);
+
+    // output_shape_vec: {o_h, o_w} or {o_d, o_h, o_w}
+    std::vector<int64_t> output_shape_vec(
+        framework::vectorize(output_grad->dims()));
+    output_shape_vec.erase(output_shape_vec.begin(),
+                           output_shape_vec.begin() + 2);
+
+    // use col_shape in the im2col calculation
+    // col_shape_vec: {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, o_d,
+    // o_h, o_w}
+    std::vector<int64_t> col_shape_vec;
+    col_shape_vec.push_back(input->dims()[1] / groups);
+    col_shape_vec.insert(col_shape_vec.end(), filter_shape_vec.begin(),
+                         filter_shape_vec.end());
+    col_shape_vec.insert(col_shape_vec.end(), output_shape_vec.begin(),
+                         output_shape_vec.end());
+    framework::DDim col_shape(framework::make_ddim(col_shape_vec));
+
     // use col_matrix_shape in the gemm calculation
-    framework::DDim col_matrix_shape = {
-        input_channels / groups * filter_height * filter_width,
-        output_height * output_width};
-    Tensor col;
-    col.mutable_data<T>(col_shape, context.GetPlace());
-    // col_matrix shares the same piece of data with col,
-    // but will be reshaped into a two-dimensional matrix shape
-    // to call the matrix multiplication interface.
-    Tensor col_matrix = col;
-    col_matrix.Resize(col_matrix_shape);
+    // size: (i_c/g * k_h * k_w, o_h * o_w)
+    // or
+    // (i_c/g * k_d * k_h * k_w, o_d * o_h * o_w)
+    framework::DDim col_matrix_shape =
+        framework::flatten_to_2d(col_shape, filter_shape_vec.size() + 1);
 
-    framework::DDim input_shape = {input->dims()[1], input->dims()[2],
-                                   input->dims()[3]};
-    framework::DDim output_matrix_shape = {
-        output_grad->dims()[1],
-        output_grad->dims()[2] * output_grad->dims()[3]};
+    framework::DDim input_shape = framework::slice_ddim(
+        input->dims(), 1, static_cast<int>(input->dims().size()));
 
     framework::DDim filter_matrix_shape = {filter.dims()[0],
                                            filter.numel() / filter.dims()[0]};
     filter.Resize(filter_matrix_shape);
 
-    // convolution backward input operator:  gemm + col2im
-    // convolution backward weight operator: im2col + gemm
-    int in_step = input_channels / groups;
-    int out_step = output_channels / groups;
+    framework::DDim output_matrix_shape = {
+        output_grad->dims()[1],
+        output_grad->numel() /
+            (output_grad->dims()[0] * output_grad->dims()[1])};
+
+    // convolution backward input operator:  gemm + col2im(or col2vol)
+    // convolution backward weight operator: im2col(or vol2col) + gemm
+    int in_step = static_cast<int>(input->dims()[1]) / groups;
+    int out_step = static_cast<int>(output_grad->dims()[1]) / groups;
+
+    Tensor col;
+    // col_matrix shares the same piece of data with col,
+    // but will be reshaped into a two-dimensional matrix shape
+    // to call the matrix multiplication interface.
+    Tensor col_matrix;
+    col.mutable_data<T>(col_shape, context.GetPlace());
+    col_matrix.ShareDataWith(col);
+    col_matrix.Resize(col_matrix_shape);
 
+    math::SetConstant<Place, T> set_zero;
+
     if (input_grad) {
       input_grad->mutable_data<T>(context.GetPlace());
-      auto t = framework::EigenVector<T>::Flatten(*input_grad);
-      t.device(context.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0));
+      set_zero(context.device_context(), input_grad, static_cast<T>(0));
 
       for (int i = 0; i < batch_size; i++) {
         Tensor out_grad_batch =
@@ -208,13 +257,22 @@ class GemmConvGrad2DKernel : public framework::OpKernel<T> {
           math::matmul<Place, T>(context.device_context(), filter_slice, true,
                                  out_grad_slice, false, T(1.0), &col_matrix,
                                  T(0.0));
+
           // col2im
           Tensor in_grad_slice =
               in_grad_batch.Slice(g * in_step, (g + 1) * in_step);
-          col2im(context.device_context(), in_grad_slice, col, strides[0],
-                 strides[1], paddings[0], paddings[0], paddings[1],
-                 paddings[1]);
+
+          if (filter_shape_vec.size() == 2) {
+            math::Col2ImFunctor<math::ColFormat::kCFO, Place, T> col2im;
+            col2im(context.device_context(), in_grad_slice, col, strides[0],
+                   strides[1], paddings[0], paddings[0], paddings[1],
+                   paddings[1]);
+          } else if (filter_shape_vec.size() == 3) {
+            math::Col2VolFunctor<Place, T> col2vol;
+            col2vol(context.device_context(), in_grad_slice, col, strides[0],
+                    strides[1], strides[2], paddings[0], paddings[1],
+                    paddings[2]);
+          }
         }
       }
     }
@@ -223,8 +281,7 @@ class GemmConvGrad2DKernel : public framework::OpKernel<T> {
       filter_grad->mutable_data<T>(context.GetPlace());
       Tensor filter_grad_ = *filter_grad;
       filter_grad_.Resize(filter_matrix_shape);
-      auto t = framework::EigenVector<T>::Flatten(filter_grad_);
-      t.device(context.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0));
+      set_zero(context.device_context(), filter_grad, static_cast<T>(0));
 
       for (int i = 0; i < batch_size; i++) {
         Tensor out_grad_batch =
@@ -235,9 +292,18 @@ class GemmConvGrad2DKernel : public framework::OpKernel<T> {
           Tensor out_grad_slice =
              out_grad_batch.Slice(g * out_step, (g + 1) * out_step);
           Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
-          im2col(context.device_context(), in_slice, col, strides[0],
-                 strides[1], paddings[0], paddings[0], paddings[1],
-                 paddings[1]);
+
+          if (filter_shape_vec.size() == 2) {
+            math::Im2ColFunctor<math::ColFormat::kCFO, Place, T> im2col;
+            im2col(context.device_context(), in_slice, col, strides[0],
+                   strides[1], paddings[0], paddings[0], paddings[1],
+                   paddings[1]);
+          } else if (filter_shape_vec.size() == 3) {
+            math::Vol2ColFunctor<Place, T> vol2col;
+            vol2col(context.device_context(), in_slice, col, strides[0],
+                    strides[1], strides[2], paddings[0], paddings[1],
+                    paddings[2]);
+          }
 
           // gemm
           Tensor filter_grad_slice =
@@ -250,6 +316,5 @@ class GemmConvGrad2DKernel : public framework::OpKernel<T> {
       }
     }
   }
 };
-
 }  // namespace operators
 }  // namespace paddle
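
Both kernels above stand on the same trick: im2col (vol2col in the 3-D case)
unrolls each filter-sized window of the input into one column, so the whole
convolution collapses into a single GEMM between the flattened filter and the
col matrix of shape (i_c/g * k_h * k_w, o_h * o_w). A toy single-channel,
stride-1, no-padding sketch (illustrative only; the layout loosely follows the
kCFO format named above):

#include <cstdio>
#include <vector>

std::vector<float> Im2Col(const std::vector<float>& im, int h, int w, int k_h,
                          int k_w) {
  int o_h = h - k_h + 1, o_w = w - k_w + 1;
  std::vector<float> col(k_h * k_w * o_h * o_w);
  for (int kh = 0; kh < k_h; ++kh)
    for (int kw = 0; kw < k_w; ++kw)
      for (int oh = 0; oh < o_h; ++oh)
        for (int ow = 0; ow < o_w; ++ow)
          // Row (kh, kw) holds the (kh, kw)-th tap of every output window.
          col[((kh * k_w + kw) * o_h + oh) * o_w + ow] =
              im[(oh + kh) * w + (ow + kw)];
  return col;
}

int main() {
  // 3x3 image, 2x2 window -> col matrix is (2*2) x (2*2).
  std::vector<float> im = {1, 2, 3, 4, 5, 6, 7, 8, 9};
  std::vector<float> col = Im2Col(im, 3, 3, 2, 2);
  for (int r = 0; r < 4; ++r) {
    for (int c = 0; c < 4; ++c) std::printf("%g ", col[r * 4 + c]);
    std::printf("\n");  // rows: 1 2 4 5 / 2 3 5 6 / 4 5 7 8 / 5 6 8 9
  }
  return 0;
}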
@@ -96,14 +96,13 @@ as used in the Neural Turing Machine: https://arxiv.org/abs/1410.5401
 
 The equation is:
 
-\f[
-    Out[i] = \sum_{j=-(N-1)/2}^{(N-1)/2} X_{i+j} * Y_{j}
-\f]
+$$Out[i] = \sum_{j=-(N-1)/2}^{(N-1)/2} X_{i+j} * Y_{j}$$
 
-where X's index is computed modulo M, and b's index is computed modulo N.
+where X's index is computed modulo M, and Y's index is computed modulo N.
 
-Both of the input `X` and `Y` can carry LoD (Level of Details) information.
-However, the output only shares the LoD information with input `X`.
+Both inputs X and Y can carry LoD (Level of Details) information.
+However, the output only shares the LoD information with input X.
 
 )DOC");
   }
 };
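
A reference loop for the circular correlation in the DOC formula (illustrative
only; this sketch puts Y's center tap at y[(n - 1) / 2], one common reading of
the modulo-N indexing, and ConvShift is a hypothetical helper):

#include <cstdio>
#include <vector>

std::vector<float> ConvShift(const std::vector<float>& x,
                             const std::vector<float>& y) {
  int m = static_cast<int>(x.size());
  int n = static_cast<int>(y.size());  // n is expected to be odd
  std::vector<float> out(m, 0.f);
  for (int i = 0; i < m; ++i) {
    for (int j = 0; j < n; ++j) {
      // X's index wraps modulo M, as the DOC states.
      int xi = ((i + j - (n - 1) / 2) % m + m) % m;
      out[i] += x[xi] * y[j];
    }
  }
  return out;
}

int main() {
  // The identity kernel {0, 1, 0} returns x unchanged: "1 2 3 4".
  std::vector<float> out = ConvShift({1.f, 2.f, 3.f, 4.f}, {0.f, 1.f, 0.f});
  for (float v : out) std::printf("%g ", v);
  std::printf("\n");
  return 0;
}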
@@ -130,9 +130,7 @@ class ConvShiftKernel<platform::GPUPlace, T> : public framework::OpKernel<T> {
     dim3 grid_dim(num_x_blocks, batch_size);
 
-    auto stream = reinterpret_cast<const platform::CUDADeviceContext &>(
-                      context.device_context())
-                      .stream();
+    auto stream = context.cuda_device_context().stream();
 
     conv_shift_forward<T><<<grid_dim, x_per_block, mem_per_block, stream>>>(
         x_data, y_data, out_data, x_width, y_width, y_half_width, batch_size);
@@ -159,9 +157,7 @@ class ConvShiftGradKernel<platform::GPUPlace, T>
     int y_width = Y->dims()[1];
     int y_half_width = (y_width - 1) / 2;
 
-    auto stream = reinterpret_cast<const platform::CUDADeviceContext &>(
-                      context.device_context())
-                      .stream();
+    auto stream = context.cuda_device_context().stream();
 
     const int x_per_block = 256;
     int num_x_blocks = div_up(x_width, x_per_block);
@@ -12,18 +12,18 @@
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/conv2dtranspose_op.h"
+#include "paddle/operators/conv_transpose_op.h"
 
 namespace paddle {
 namespace operators {
 
-void Conv2DTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
+void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
   PADDLE_ENFORCE(ctx->HasInput("Input"),
-                 "Input(Input) of Conv2DTransposeOp should not be null.");
+                 "Input(Input) of ConvTransposeOp should not be null.");
   PADDLE_ENFORCE(ctx->HasInput("Filter"),
-                 "Input(Filter) of Conv2DTransposeOp should not be null.");
+                 "Input(Filter) of ConvTransposeOp should not be null.");
   PADDLE_ENFORCE(ctx->HasOutput("Output"),
-                 "Output(Output) of Conv2DTransposeOp should not be null.");
+                 "Output(Output) of ConvTransposeOp should not be null.");
 
   auto in_dims = ctx->GetInputDim("Input");
   auto filter_dims = ctx->GetInputDim("Filter");
@@ -35,17 +35,27 @@ void Conv2DTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
                       "No Padding allowed in conv transpose op.");
   }
 
-  PADDLE_ENFORCE_EQ(in_dims.size(), 4,
-                    "Conv2DTransposeOp input should be 4-D tensor.");
-  PADDLE_ENFORCE_EQ(filter_dims.size(), 4,
-                    "Conv2DTransposeOp filter should be 4-D tensor.");
+  PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5,
+                 "ConvTransposeOp input should be 4-D or 5-D tensor.");
+  PADDLE_ENFORCE_EQ(in_dims.size(), filter_dims.size(),
+                    "ConvTransposeOp input dimension and filter dimension "
+                    "should be the same.");
+  PADDLE_ENFORCE(in_dims.size() - strides.size() == 2U,
+                 "ConvTransposeOp input dimension and strides dimension should "
+                 "be consistent.");
+  PADDLE_ENFORCE_EQ(paddings.size(), strides.size(),
+                    "ConvTransposeOp paddings dimension and Conv strides "
+                    "dimension should be the same.");
   PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[0],
-                    "input and kernel input dimension should be equal.");
+                    "In ConvTransposeOp, the input channel should be the same "
+                    "as the number of filters.");
 
-  auto output_height = (in_dims[2] - 1) * strides[0] + filter_dims[2];
-  auto output_width = (in_dims[3] - 1) * strides[1] + filter_dims[3];
-  ctx->SetOutputDim("Output",
-                    {in_dims[0], filter_dims[1], output_height, output_width});
+  std::vector<int64_t> output_shape({in_dims[0], filter_dims[1]});
+  for (size_t i = 0; i < paddings.size(); ++i) {
+    output_shape.push_back((in_dims[i + 2] - 1) * strides[i] +
+                           filter_dims[i + 2]);
+  }
+  ctx->SetOutputDim("Output", framework::make_ddim(output_shape));
 }
 
 Conv2DTransposeOpMaker::Conv2DTransposeOpMaker(
@@ -55,32 +65,108 @@ Conv2DTransposeOpMaker::Conv2DTransposeOpMaker(
       "Input",
       "(Tensor) The input tensor of convolution transpose operator. "
       "The format of input tensor is NCHW, where N is batch size, C is the "
-      "number of input channels, H and W is the height and width of image.");
+      "number of input channels, H is the height of the feature, and "
+      "W is the width of the feature.");
   AddInput("Filter",
-           "(Tensor) The filter tensor of convolution transpose operator."
+           "(Tensor) The filter tensor of convolution transpose operator. "
            "The format of the filter tensor is CMHW, where C is the number of "
            "output image channels, M is the number of input image channels, "
-           "H and W is height and width of filter. "
+           "H is the height of the filter, and W is the width of the filter. "
           "We enforce groups number == 1 and padding == 0 in "
-           "convolution transpose Scenario.");
+           "the convolution transpose scenario.");
   AddOutput("Output",
-            "(Tensor) The output tensor of convolution transpose operator."
+            "(Tensor) The output tensor of convolution transpose operator. "
            "The format of output tensor is also NCHW.");
-  AddAttr<std::vector<int>>("strides",
-                            "strides of convolution transpose operator.")
+  AddAttr<std::vector<int>>(
+      "strides",
+      "(vector default:{1, 1}), strides of convolution transpose operator.")
       .SetDefault({1, 1});
-  AddAttr<std::vector<int>>("paddings",
-                            "paddings of convolution transpose operator.")
+  AddAttr<std::vector<int>>(
+      "paddings",
+      "(vector default:{0, 0}), paddings of convolution transpose operator.")
      .SetDefault({0, 0});
   AddComment(R"DOC(
+Convolution2D Transpose Operator.
+
 The convolution transpose operation calculates the output based on the input, filter
 and strides, paddings, groups parameters. The size of each dimension of the
 parameters is checked in the infer-shape.
+Input(Input, Filter) and output(Output) are in NCHW format, where N is batch
+size, C is the number of channels, H is the height of the feature, and
+W is the width of the feature. Parameters (ksize, strides, paddings) consist of
+two elements. These two elements represent height and width, respectively.
+The input(X) size and output(Out) size may be different.
+Example:
+  Input:
+       Input shape: (N, C_in, H_in, W_in)
+       Filter shape: (C_in, C_out, H_f, W_f)
+  Output:
+       Output shape: (N, C_out, H_out, W_out)
+  where
+       H_out = (H_in - 1) * strides[0] - 2 * paddings[0] + filter_size[0];
+       W_out = (W_in - 1) * strides[1] - 2 * paddings[1] + filter_size[1];
 )DOC");
 }
 
-void Conv2DTransposeOpGrad::InferShape(
-    framework::InferShapeContext* ctx) const {
+Conv3DTransposeOpMaker::Conv3DTransposeOpMaker(
+    framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+    : OpProtoAndCheckerMaker(proto, op_checker) {
+  AddInput("Input",
+           "(Tensor) The input tensor of convolution transpose operator. "
+           "The format of input tensor is NCDHW, where N is batch size, C is "
+           "the number of channels, D is the depth of the feature, H is the "
+           "height of the feature, and W is the width of the feature.");
+  AddInput("Filter",
+           "(Tensor) The filter tensor of convolution transpose operator. "
+           "The format of the filter tensor is CMDHW, where C is the number of "
+           "output image channels, M is the number of input image channels, D "
+           "is the depth of the filter, H is the height of the filter, and "
+           "W is the width of the filter. "
+           "We enforce groups number == 1 and padding == 0 in "
+           "the convolution3d transpose scenario.");
+  AddOutput("Output",
+            "(Tensor) The output tensor of convolution transpose operator. "
+            "The format of output tensor is also NCDHW, where N is batch size, "
+            "C is the number of channels, D is the depth of the feature, H is "
+            "the height of the feature, and W is the width of the feature.");
+  AddAttr<std::vector<int>>(
+      "strides",
+      "(vector default:{1, 1, 1}), strides of convolution transpose operator.")
+      .SetDefault({1, 1, 1});
+  AddAttr<std::vector<int>>(
+      "paddings",
+      "(vector default:{0, 0, 0}), paddings of convolution transpose operator.")
+      .SetDefault({0, 0, 0});
+  AddComment(R"DOC(
+Convolution3D Transpose Operator.
+
+The convolution transpose operation calculates the output based on the input, filter
+and strides, paddings, groups parameters. The size of each dimension of the
+parameters is checked in the infer-shape.
+Input(Input, Filter) and output(Output) are in NCDHW format, where N is batch
+size, C is the number of channels, D is the depth of the feature,
+H is the height of the feature, and W is the width of the feature.
+Parameters (ksize, strides, paddings) consist of three elements.
+These three elements represent depth, height and width, respectively.
+The input(X) size and output(Out) size may be different.
+Example:
+  Input:
+       Input shape: (N, C_in, D_in, H_in, W_in)
+       Filter shape: (C_in, C_out, D_f, H_f, W_f)
+  Output:
+       Output shape: (N, C_out, D_out, H_out, W_out)
+  where
+       D_out = (D_in - 1) * strides[0] - 2 * paddings[0] + filter_size[0];
+       H_out = (H_in - 1) * strides[1] - 2 * paddings[1] + filter_size[1];
+       W_out = (W_in - 1) * strides[2] - 2 * paddings[2] + filter_size[2];
+)DOC");
+}
+
+void ConvTransposeOpGrad::InferShape(framework::InferShapeContext* ctx) const {
   auto in_dims = ctx->GetInputDim("Input");
   auto filter_dims = ctx->GetInputDim("Filter");
   if (ctx->HasOutput(framework::GradVarName("Input"))) {
@@ -95,13 +181,23 @@ void Conv2DTransposeOpGrad::InferShape(
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(conv2dtranspose, ops::Conv2DTransposeOp,
-            ops::Conv2DTransposeOpMaker, conv2dtranspose_grad,
-            ops::Conv2DTransposeOpGrad);
+
+REGISTER_OP(conv2d_transpose, ops::ConvTransposeOp, ops::Conv2DTransposeOpMaker,
+            conv2d_transpose_grad, ops::ConvTransposeOpGrad);
+
+REGISTER_OP_CPU_KERNEL(
+    conv2d_transpose,
+    ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    conv2d_transpose_grad,
+    ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>);
+
+REGISTER_OP(conv3d_transpose, ops::ConvTransposeOp, ops::Conv3DTransposeOpMaker,
+            conv3d_transpose_grad, ops::ConvTransposeOpGrad);
 
 REGISTER_OP_CPU_KERNEL(
-    conv2dtranspose,
-    ops::GemmConv2DTransposeKernel<paddle::platform::CPUPlace, float>);
+    conv3d_transpose,
+    ops::GemmConvTransposeKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(
-    conv2dtranspose_grad,
-    ops::GemmConv2DTransposeGradKernel<paddle::platform::CPUPlace, float>);
+    conv3d_transpose_grad,
+    ops::GemmConvTransposeGradKernel<paddle::platform::CPUPlace, float>);
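
The output-size rule ConvTransposeOp::InferShape applies per spatial dimension
(padding is enforced to be zero above, so the -2 * paddings term in the DOC
formulas drops out). A standalone sketch with a worked example (illustrative
only; TransposeOutputSize is a hypothetical helper):

#include <cstdio>

int TransposeOutputSize(int input_size, int filter_size, int stride) {
  // H_out = (H_in - 1) * stride + filter_size, with padding == 0.
  return (input_size - 1) * stride + filter_size;
}

int main() {
  // A 2-element input, stride 2, filter width 3 -> 5 outputs, matching the
  // 1-D scatter sketch shown earlier for the cudnn transpose kernel.
  std::printf("%d\n", TransposeOutputSize(2, 3, 2));  // prints 5
  return 0;
}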
@@ -12,13 +12,20 @@
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/conv2dtranspose_op.h"
+#include "paddle/operators/conv_transpose_op.h"
 
 namespace ops = paddle::operators;
 
 REGISTER_OP_GPU_KERNEL(
-    conv2dtranspose,
-    ops::GemmConv2DTransposeKernel<paddle::platform::GPUPlace, float>);
+    conv2d_transpose,
+    ops::GemmConvTransposeKernel<paddle::platform::GPUPlace, float>);
 REGISTER_OP_GPU_KERNEL(
-    conv2dtranspose_grad,
-    ops::GemmConv2DTransposeGradKernel<paddle::platform::GPUPlace, float>);
+    conv2d_transpose_grad,
+    ops::GemmConvTransposeGradKernel<paddle::platform::GPUPlace, float>);
+
+REGISTER_OP_GPU_KERNEL(
+    conv3d_transpose,
+    ops::GemmConvTransposeKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    conv3d_transpose_grad,
+    ops::GemmConvTransposeGradKernel<paddle::platform::GPUPlace, float>);