Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
32ca9fe7
P
Paddle
项目概览
PaddlePaddle
/
Paddle
1 年多 前同步成功
通知
2302
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
32ca9fe7
编写于
8月 16, 2018
作者:
M
minqiyang
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'develop' of
https://github.com/PaddlePaddle/Paddle
into change_manylinux1_Docker
上级
ba84a6b7
d7873e14
变更
44
隐藏空白更改
内联
并排
Showing
44 changed file
with
593 addition
and
222 deletion
+593
-222
.gitignore
.gitignore
+2
-0
CMakeLists.txt
CMakeLists.txt
+2
-1
cmake/configure.cmake
cmake/configure.cmake
+8
-0
cmake/external/anakin.cmake
cmake/external/anakin.cmake
+2
-3
paddle/fluid/API.spec
paddle/fluid/API.spec
+3
-2
paddle/fluid/inference/api/CMakeLists.txt
paddle/fluid/inference/api/CMakeLists.txt
+1
-1
paddle/fluid/operators/activation_op.cu
paddle/fluid/operators/activation_op.cu
+3
-1
paddle/fluid/operators/activation_op.h
paddle/fluid/operators/activation_op.h
+6
-6
paddle/fluid/operators/assign_value_op.cu.cc
paddle/fluid/operators/assign_value_op.cu.cc
+4
-1
paddle/fluid/operators/conv_cudnn_op.cu.cc
paddle/fluid/operators/conv_cudnn_op.cu.cc
+36
-20
paddle/fluid/operators/cross_entropy_op.cu
paddle/fluid/operators/cross_entropy_op.cu
+8
-4
paddle/fluid/operators/elementwise_add_op.cu
paddle/fluid/operators/elementwise_add_op.cu
+2
-1
paddle/fluid/operators/elementwise_div_op.cu
paddle/fluid/operators/elementwise_div_op.cu
+7
-2
paddle/fluid/operators/elementwise_mul_op.cu
paddle/fluid/operators/elementwise_mul_op.cu
+7
-1
paddle/fluid/operators/elementwise_op_function.h
paddle/fluid/operators/elementwise_op_function.h
+2
-2
paddle/fluid/operators/elementwise_sub_op.cu
paddle/fluid/operators/elementwise_sub_op.cu
+7
-1
paddle/fluid/operators/fill_constant_op.cc
paddle/fluid/operators/fill_constant_op.cc
+19
-34
paddle/fluid/operators/fill_constant_op.cu.cc
paddle/fluid/operators/fill_constant_op.cu.cc
+26
-0
paddle/fluid/operators/fill_constant_op.h
paddle/fluid/operators/fill_constant_op.h
+48
-0
paddle/fluid/operators/fill_op.cc
paddle/fluid/operators/fill_op.cc
+1
-1
paddle/fluid/operators/gaussian_random_op.cu
paddle/fluid/operators/gaussian_random_op.cu
+2
-0
paddle/fluid/operators/math/cross_entropy.cu
paddle/fluid/operators/math/cross_entropy.cu
+18
-2
paddle/fluid/operators/math/cross_entropy.h
paddle/fluid/operators/math/cross_entropy.h
+17
-0
paddle/fluid/operators/math/selected_rows_functor.cu
paddle/fluid/operators/math/selected_rows_functor.cu
+11
-2
paddle/fluid/operators/math/softmax.cu
paddle/fluid/operators/math/softmax.cu
+3
-0
paddle/fluid/operators/mean_op.cu
paddle/fluid/operators/mean_op.cu
+6
-4
paddle/fluid/operators/mean_op.h
paddle/fluid/operators/mean_op.h
+1
-1
paddle/fluid/operators/mul_op.cu.cc
paddle/fluid/operators/mul_op.cu.cc
+4
-3
paddle/fluid/operators/pool_cudnn_op.cu.cc
paddle/fluid/operators/pool_cudnn_op.cu.cc
+4
-2
paddle/fluid/operators/prelu_op.cc
paddle/fluid/operators/prelu_op.cc
+52
-13
paddle/fluid/operators/prelu_op.cu
paddle/fluid/operators/prelu_op.cu
+0
-22
paddle/fluid/operators/prelu_op.h
paddle/fluid/operators/prelu_op.h
+73
-52
paddle/fluid/operators/scale_op.cu
paddle/fluid/operators/scale_op.cu
+5
-1
paddle/fluid/operators/scatter_op.h
paddle/fluid/operators/scatter_op.h
+2
-2
paddle/fluid/operators/softmax_cudnn_op.cu.cc
paddle/fluid/operators/softmax_cudnn_op.cu.cc
+2
-1
paddle/fluid/operators/softmax_op.cu.cc
paddle/fluid/operators/softmax_op.cu.cc
+2
-1
paddle/fluid/operators/sum_op.cu
paddle/fluid/operators/sum_op.cu
+4
-1
paddle/fluid/operators/sum_op.h
paddle/fluid/operators/sum_op.h
+1
-1
paddle/fluid/operators/top_k_op.cu
paddle/fluid/operators/top_k_op.cu
+22
-6
paddle/fluid/operators/uniform_random_op.cu
paddle/fluid/operators/uniform_random_op.cu
+50
-9
python/CMakeLists.txt
python/CMakeLists.txt
+3
-2
python/paddle/fluid/layers/nn.py
python/paddle/fluid/layers/nn.py
+60
-2
python/paddle/fluid/tests/unittests/test_layers.py
python/paddle/fluid/tests/unittests/test_layers.py
+15
-0
python/paddle/fluid/tests/unittests/test_prelu_op.py
python/paddle/fluid/tests/unittests/test_prelu_op.py
+42
-14
未找到文件。
.gitignore
浏览文件 @
32ca9fe7
...
...
@@ -5,6 +5,7 @@ python/paddle/v2/fluid/tests/book/image_classification_resnet.inference.model/
python/paddle/v2/fluid/tests/book/image_classification_vgg.inference.model/
python/paddle/v2/fluid/tests/book/label_semantic_roles.inference.model/
*.DS_Store
*.vs
build/
build_doc/
*.user
...
...
@@ -15,6 +16,7 @@ build_doc/
.cproject
.pydevproject
.settings/
CMakeSettings.json
Makefile
.test_env/
third_party/
...
...
CMakeLists.txt
浏览文件 @
32ca9fe7
...
...
@@ -204,11 +204,12 @@ include(external/snappy) # download snappy
include
(
external/snappystream
)
include
(
external/threadpool
)
set
(
WITH_ANAKIN OFF CACHE STRING
"Disable Anakin first, will add it later."
FORCE
)
# Pull in the GPU toolchain modules when building with GPU support.
if(WITH_GPU)
  include(cuda)
  include(tensorrt)
  include(external/anakin)
else()
  # Non-GPU build: switch Anakin off, overriding any cached value.
  # This must be a plain else(); an empty elseif() condition would not
  # select this branch.
  set(WITH_ANAKIN OFF CACHE STRING "Anakin is used in GPU only now." FORCE)
endif()
include
(
cudnn
)
# set cudnn libraries, must before configure
...
...
cmake/configure.cmake
浏览文件 @
32ca9fe7
...
...
@@ -56,6 +56,10 @@ if(NOT CMAKE_CROSSCOMPILING)
set
(
SIMD_FLAG
${
SSE3_FLAG
}
)
endif
()
endif
()
# Define LINUX for every UNIX platform that is not Apple.
# NOTE(review): this condition also matches other non-Apple Unix-likes,
# not only Linux - confirm that is intended.
if(UNIX AND NOT APPLE)
  set(LINUX TRUE)
endif()
if
(
NOT WITH_GOLANG
)
add_definitions
(
-DPADDLE_WITHOUT_GOLANG
)
...
...
@@ -104,6 +108,10 @@ if(WITH_GPU)
if
(
${
CUDNN_MAJOR_VERSION
}
VERSION_LESS 7
)
message
(
FATAL_ERROR
"Anakin needs CUDNN >= 7.0 to compile"
)
endif
()
set
(
ENV{CUDNN_INCLUDE_DIR}
${
CUDNN_INCLUDE_DIR
}
)
set
(
ENV{CUDNN_LIBRARY}
${
CUDNN_LIBRARY
}
)
message
(
STATUS
"cudnn include header is
${
CUDNN_INCLUDE_DIR
}
/cudnn.h"
)
message
(
STATUS
"cudnn library is
${
CUDNN_LIBRARY
}
"
)
endif
()
elseif
(
WITH_AMD_GPU
)
add_definitions
(
-DPADDLE_WITH_HIP
)
...
...
cmake/external/anakin.cmake
浏览文件 @
32ca9fe7
...
...
@@ -35,9 +35,8 @@ set(ANAKIN_COMPILE_EXTRA_FLAGS
ExternalProject_Add
(
extern_anakin
${
EXTERNAL_PROJECT_LOG_ARGS
}
# TODO(luotao): use PaddlePaddle/Anakin later
GIT_REPOSITORY
"https://github.com/luotao1/Anakin"
GIT_TAG
"3957ae9263eaa0b1986758dac60a88852afb09be"
GIT_REPOSITORY
"https://github.com/PaddlePaddle/Anakin"
GIT_TAG
"04256ba78fa3da0beb74e8036c8efd68c12824d6"
PREFIX
${
ANAKIN_SOURCE_DIR
}
UPDATE_COMMAND
""
CMAKE_ARGS -DUSE_GPU_PLACE=YES
...
...
paddle/fluid/API.spec
浏览文件 @
32ca9fe7
...
...
@@ -155,10 +155,11 @@ paddle.fluid.layers.resize_bilinear ArgSpec(args=['input', 'out_shape', 'scale',
paddle.fluid.layers.gather ArgSpec(args=['input', 'index'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.random_crop ArgSpec(args=['x', 'shape', 'seed'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.mean_iou ArgSpec(args=['input', 'label', 'num_classes'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.relu ArgSpec(args=['x'
], varargs=None, keywords=None, defaults=None
)
paddle.fluid.layers.log ArgSpec(args=['x'
], varargs=None, keywords=None, defaults=None
)
paddle.fluid.layers.relu ArgSpec(args=['x'
, 'name'], varargs=None, keywords=None, defaults=(None,)
)
paddle.fluid.layers.log ArgSpec(args=['x'
, 'name'], varargs=None, keywords=None, defaults=(None,)
)
paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.prelu ArgSpec(args=['x', 'mode', 'param_attr', 'name'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.flatten ArgSpec(args=['x', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None))
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True))
...
...
paddle/fluid/inference/api/CMakeLists.txt
浏览文件 @
32ca9fe7
...
...
@@ -60,7 +60,7 @@ cc_library(paddle_inference_tensorrt_subgraph_engine
inference_api_test
(
test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_engine_tester.cc ARGS test_word2vec
)
endif
()
if
(
WITH_ANAKIN
)
# only needed in CI
if
(
WITH_ANAKIN
AND WITH_GPU
)
# only needed in CI
# compile the libinference_anakin_api.a and anakin.so.
nv_library
(
inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber
)
#nv_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin)
...
...
paddle/fluid/operators/activation_op.cu
浏览文件 @
32ca9fe7
...
...
@@ -26,6 +26,8 @@ namespace plat = paddle::platform;
act_type##_grad, ops::ActivationGradKernel<plat::CUDADeviceContext, \
ops::grad_functor<float>>, \
ops::ActivationGradKernel<plat::CUDADeviceContext, \
ops::grad_functor<double>>);
ops::grad_functor<double>>, \
ops::ActivationGradKernel<plat::CUDADeviceContext, \
ops::grad_functor<plat::float16>>);
FOR_EACH_KERNEL_FUNCTOR
(
REGISTER_ACTIVATION_CUDA_KERNEL
);
paddle/fluid/operators/activation_op.h
浏览文件 @
32ca9fe7
...
...
@@ -333,8 +333,7 @@ struct SqrtGradFunctor : public BaseActivationFunctor<T> {
template
<
typename
Device
,
typename
X
,
typename
Out
,
typename
dOut
,
typename
dX
>
void
operator
()(
Device
d
,
X
x
,
Out
out
,
dOut
dout
,
dX
dx
)
const
{
const
Out
out_conj
=
Eigen
::
numext
::
conj
(
out
);
dx
.
device
(
d
)
=
static_cast
<
T
>
(
0.5
)
*
dout
/
out_conj
;
dx
.
device
(
d
)
=
static_cast
<
T
>
(
0.5
)
*
dout
/
out
;
}
};
...
...
@@ -740,7 +739,7 @@ struct PowGradFunctor : public BaseActivationFunctor<T> {
typename
dX
>
void
operator
()(
Device
d
,
X
x
,
Out
out
,
dOut
dout
,
dX
dx
)
const
{
dx
.
device
(
d
)
=
dout
*
static_cast
<
T
>
(
factor
)
*
x
.
pow
(
static_cast
<
T
>
(
factor
-
static_cast
<
T
>
(
1
)
));
x
.
pow
(
static_cast
<
T
>
(
factor
)
-
static_cast
<
T
>
(
1
));
}
};
...
...
@@ -863,10 +862,11 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
template
<
typename
Device
,
typename
X
,
typename
Out
,
typename
dOut
,
typename
dX
>
void
operator
()(
Device
d
,
X
x
,
Out
out
,
dOut
dout
,
dX
dx
)
const
{
T
b
=
static_cast
<
T
>
(
beta
);
auto
temp1
=
static_cast
<
T
>
(
1
)
/
(
static_cast
<
T
>
(
1
)
+
(
static_cast
<
T
>
(
-
b
eta
)
*
x
).
exp
());
auto
temp2
=
temp1
*
(
static_cast
<
T
>
(
1
)
-
(
b
eta
*
out
));
dx
.
device
(
d
)
=
dout
*
((
b
eta
*
out
)
+
temp2
);
(
static_cast
<
T
>
(
1
)
+
(
static_cast
<
T
>
(
-
b
)
*
x
).
exp
());
auto
temp2
=
temp1
*
(
static_cast
<
T
>
(
1
)
-
(
b
*
out
));
dx
.
device
(
d
)
=
dout
*
((
b
*
out
)
+
temp2
);
}
};
...
...
paddle/fluid/operators/assign_value_op.cu.cc
浏览文件 @
32ca9fe7
...
...
@@ -13,7 +13,10 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/assign_value_op.h"
#include "paddle/fluid/platform/float16.h"
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_CUDA_KERNEL
(
assign_value
,
ops
::
AssignValueKernel
<
int
>
,
ops
::
AssignValueKernel
<
float
>
);
ops
::
AssignValueKernel
<
float
>
,
ops
::
AssignValueKernel
<
plat
::
float16
>
);
paddle/fluid/operators/conv_cudnn_op.cu.cc
浏览文件 @
32ca9fe7
...
...
@@ -39,6 +39,27 @@ using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
static
constexpr
size_t
kCONV_CUDNN_WORKSPACE_LIMIT_BYTES
=
static_cast
<
size_t
>
(
1024
)
*
1024
*
1024
;
// Chooses the cuDNN math type for `cudnn_conv_desc` and reports whether
// tensor-op math was selected.
//
// Inside the `CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)` guard,
// CUDNN_TENSOR_OP_MATH is set when the device's compute capability is at
// least 70 and T is platform::float16 (tensor cores are supported since the
// Volta GPU and are only enabled for float16 input and filter data).
// Otherwise the descriptor is set to CUDNN_DEFAULT_MATH. Outside the guard
// the descriptor is not touched and the result is always false.
//
// T           element type of the convolution; supplied explicitly by callers.
// dev_ctx     device context queried through GetComputeCapability().
// returns     true only when CUDNN_TENSOR_OP_MATH was set on the descriptor.
template <typename T, typename DeviceContext>
bool EnableFp16(const DeviceContext& dev_ctx,
                cudnnConvolutionDescriptor_t cudnn_conv_desc) {
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
  const bool use_tensor_core =
      dev_ctx.GetComputeCapability() >= 70 &&
      std::type_index(typeid(T)) == std::type_index(typeid(platform::float16));
  // One call covers both cases; only the requested math type differs.
  PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
      cudnn_conv_desc,
      use_tensor_core ? CUDNN_TENSOR_OP_MATH : CUDNN_DEFAULT_MATH));
  return use_tensor_core;
#else
  return false;
#endif
}
template
<
typename
T
>
class
CUDNNConvOpKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
...
...
@@ -128,27 +149,14 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
cudnnConvolutionFwdAlgo_t
algo
;
auto
&
dev_ctx
=
ctx
.
template
device_context
<
platform
::
CUDADeviceContext
>();
auto
handle
=
dev_ctx
.
cudnn_handle
();
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionForwardAlgorithm
(
handle
,
cudnn_input_desc
,
cudnn_filter_desc
,
cudnn_conv_desc
,
cudnn_output_desc
,
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT
,
workspace_size_limit
,
&
algo
));
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
// Tensor core is supported since the volta GPU and
// is only enabled when input and filter data are float16
if
(
dev_ctx
.
GetComputeCapability
()
>=
70
&&
std
::
type_index
(
typeid
(
T
))
==
std
::
type_index
(
typeid
(
platform
::
float16
)))
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnSetConvolutionMathType
(
cudnn_conv_desc
,
CUDNN_TENSOR_OP_MATH
));
// Currently tensor core is only enabled using this algo
if
(
EnableFp16
<
T
>
(
dev_ctx
,
cudnn_conv_desc
))
{
algo
=
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
;
}
else
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnSetConvolutionMathType
(
cudnn_conv_desc
,
CUDNN_DEFAULT_MATH
));
PADDLE_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionForwardAlgorithm
(
handle
,
cudnn_input_desc
,
cudnn_filter_desc
,
cudnn_conv_desc
,
cudnn_output_desc
,
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT
,
workspace_size_limit
,
&
algo
));
}
#endif
// get workspace size able to allocate
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionForwardWorkspaceSize
(
...
...
@@ -288,6 +296,9 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
}
else
{
data_algo
=
CUDNN_CONVOLUTION_BWD_DATA_ALGO_1
;
}
if
(
EnableFp16
<
T
>
(
dev_ctx
,
cudnn_conv_desc
))
{
data_algo
=
CUDNN_CONVOLUTION_BWD_DATA_ALGO_1
;
}
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionBackwardDataWorkspaceSize
(
...
...
@@ -307,6 +318,9 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
}
else
{
filter_algo
=
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1
;
}
if
(
EnableFp16
<
T
>
(
dev_ctx
,
cudnn_conv_desc
))
{
filter_algo
=
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1
;
}
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionBackwardFilterWorkspaceSize
(
...
...
@@ -362,7 +376,8 @@ REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace,
paddle
::
operators
::
CUDNNConvOpKernel
<
plat
::
float16
>
);
REGISTER_OP_KERNEL
(
conv2d_grad
,
CUDNN
,
plat
::
CUDAPlace
,
paddle
::
operators
::
CUDNNConvGradOpKernel
<
float
>
,
paddle
::
operators
::
CUDNNConvGradOpKernel
<
double
>
);
paddle
::
operators
::
CUDNNConvGradOpKernel
<
double
>
,
paddle
::
operators
::
CUDNNConvGradOpKernel
<
plat
::
float16
>
);
REGISTER_OP_KERNEL
(
conv3d
,
CUDNN
,
plat
::
CUDAPlace
,
paddle
::
operators
::
CUDNNConvOpKernel
<
float
>
,
...
...
@@ -370,4 +385,5 @@ REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace,
paddle
::
operators
::
CUDNNConvOpKernel
<
plat
::
float16
>
);
REGISTER_OP_KERNEL
(
conv3d_grad
,
CUDNN
,
plat
::
CUDAPlace
,
paddle
::
operators
::
CUDNNConvGradOpKernel
<
float
>
,
paddle
::
operators
::
CUDNNConvGradOpKernel
<
double
>
);
paddle
::
operators
::
CUDNNConvGradOpKernel
<
double
>
,
paddle
::
operators
::
CUDNNConvGradOpKernel
<
plat
::
float16
>
)
paddle/fluid/operators/cross_entropy_op.cu
浏览文件 @
32ca9fe7
...
...
@@ -13,12 +13,16 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/cross_entropy_op.h"
#include "paddle/fluid/platform/float16.h"
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
using
CUDACtx
=
paddle
::
platform
::
CUDADeviceContext
;
REGISTER_OP_CUDA_KERNEL
(
cross_entropy
,
ops
::
CrossEntropyOpKernel
<
CUDACtx
,
float
>
,
ops
::
CrossEntropyOpKernel
<
CUDACtx
,
double
>
);
REGISTER_OP_CUDA_KERNEL
(
cross_entropy_grad
,
ops
::
CrossEntropyGradientOpKernel
<
CUDACtx
,
float
>
,
ops
::
CrossEntropyGradientOpKernel
<
CUDACtx
,
double
>
);
ops
::
CrossEntropyOpKernel
<
CUDACtx
,
double
>
,
ops
::
CrossEntropyOpKernel
<
CUDACtx
,
plat
::
float16
>
);
REGISTER_OP_CUDA_KERNEL
(
cross_entropy_grad
,
ops
::
CrossEntropyGradientOpKernel
<
CUDACtx
,
float
>
,
ops
::
CrossEntropyGradientOpKernel
<
CUDACtx
,
double
>
,
ops
::
CrossEntropyGradientOpKernel
<
CUDACtx
,
plat
::
float16
>
);
paddle/fluid/operators/elementwise_add_op.cu
浏览文件 @
32ca9fe7
...
...
@@ -30,4 +30,5 @@ REGISTER_OP_CUDA_KERNEL(
ops
::
ElementwiseAddGradKernel
<
plat
::
CUDADeviceContext
,
float
>
,
ops
::
ElementwiseAddGradKernel
<
plat
::
CUDADeviceContext
,
double
>
,
ops
::
ElementwiseAddGradKernel
<
plat
::
CUDADeviceContext
,
int
>
,
ops
::
ElementwiseAddGradKernel
<
plat
::
CUDADeviceContext
,
int64_t
>
);
ops
::
ElementwiseAddGradKernel
<
plat
::
CUDADeviceContext
,
int64_t
>
,
ops
::
ElementwiseAddGradKernel
<
plat
::
CUDADeviceContext
,
plat
::
float16
>
);
paddle/fluid/operators/elementwise_div_op.cu
浏览文件 @
32ca9fe7
...
...
@@ -14,19 +14,24 @@ limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/fluid/operators/elementwise_div_op.h"
#include "paddle/fluid/platform/float16.h"
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_CUDA_KERNEL
(
elementwise_div
,
ops
::
ElementwiseDivKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
ElementwiseDivKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
ops
::
ElementwiseDivKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int
>
,
ops
::
ElementwiseDivKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
);
ops
::
ElementwiseDivKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
,
ops
::
ElementwiseDivKernel
<
paddle
::
platform
::
CUDADeviceContext
,
plat
::
float16
>
);
REGISTER_OP_CUDA_KERNEL
(
elementwise_div_grad
,
ops
::
ElementwiseDivGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
ElementwiseDivGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
ops
::
ElementwiseDivGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int
>
,
ops
::
ElementwiseDivGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
,
ops
::
ElementwiseDivGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
);
plat
::
float16
>
);
paddle/fluid/operators/elementwise_mul_op.cu
浏览文件 @
32ca9fe7
...
...
@@ -14,19 +14,25 @@ limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/fluid/operators/elementwise_mul_op.h"
#include "paddle/fluid/platform/float16.h"
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_CUDA_KERNEL
(
elementwise_mul
,
ops
::
ElementwiseMulKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
ElementwiseMulKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
ops
::
ElementwiseMulKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int
>
,
ops
::
ElementwiseMulKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
);
ops
::
ElementwiseMulKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
,
ops
::
ElementwiseMulKernel
<
paddle
::
platform
::
CUDADeviceContext
,
plat
::
float16
>
);
REGISTER_OP_CUDA_KERNEL
(
elementwise_mul_grad
,
ops
::
ElementwiseMulGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
ElementwiseMulGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
ops
::
ElementwiseMulGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int
>
,
ops
::
ElementwiseMulGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
plat
::
float16
>
,
ops
::
ElementwiseMulGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
);
paddle/fluid/operators/elementwise_op_function.h
浏览文件 @
32ca9fe7
...
...
@@ -350,7 +350,7 @@ static __global__ void ElemwiseGradBroadcast1CUDAKernel(
int
j
=
blockIdx
.
x
;
int
i
=
threadIdx
.
x
;
int
tid
=
threadIdx
.
x
;
T
val
=
0
;
T
val
(
0
)
;
do
{
int
x_offset
=
i
*
w
+
j
;
...
...
@@ -418,7 +418,7 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel(
int
tid
=
threadIdx
.
x
;
int
j
=
blockIdx
.
x
;
T
val
=
0
;
T
val
(
0
)
;
int
ttid
=
tid
;
while
(
true
)
{
...
...
paddle/fluid/operators/elementwise_sub_op.cu
浏览文件 @
32ca9fe7
...
...
@@ -14,19 +14,25 @@ limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/fluid/operators/elementwise_sub_op.h"
#include "paddle/fluid/platform/float16.h"
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_CUDA_KERNEL
(
elementwise_sub
,
ops
::
ElementwiseSubKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
ElementwiseSubKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
ops
::
ElementwiseSubKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int
>
,
ops
::
ElementwiseSubKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
);
ops
::
ElementwiseSubKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
,
ops
::
ElementwiseSubKernel
<
paddle
::
platform
::
CUDADeviceContext
,
plat
::
float16
>
);
REGISTER_OP_CUDA_KERNEL
(
elementwise_sub_grad
,
ops
::
ElementwiseSubGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
ElementwiseSubGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
ops
::
ElementwiseSubGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int
>
,
ops
::
ElementwiseSubGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
plat
::
float16
>
,
ops
::
ElementwiseSubGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
);
paddle/fluid/operators/fill_constant_op.cc
浏览文件 @
32ca9fe7
...
...
@@ -12,48 +12,28 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/operators/fill_constant_op.h"
#include "paddle/fluid/platform/float16.h"
namespace
paddle
{
namespace
operators
{
class
FillConstant
InferShape
:
public
framework
::
InferShapeBase
{
class
FillConstant
Op
:
public
framework
::
OperatorWithKernel
{
public:
void
operator
()(
framework
::
InferShapeContext
*
ctx
)
const
override
{
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Out"
),
"Output(Out) of FillConstantOp should not be null."
);
auto
&
shape
=
ctx
->
Attrs
().
Get
<
std
::
vector
<
int
>>
(
"shape"
);
auto
&
shape
=
ctx
->
Attrs
().
Get
<
std
::
vector
<
int
>>
(
"shape"
);
ctx
->
SetOutputDim
(
"Out"
,
framework
::
make_ddim
(
shape
));
}
};
class
FillConstantOp
:
public
framework
::
OperatorBase
{
public:
using
framework
::
OperatorBase
::
OperatorBase
;
private:
void
RunImpl
(
const
framework
::
Scope
&
scope
,
const
platform
::
Place
&
dev_place
)
const
override
{
auto
data_type
=
static_cast
<
framework
::
proto
::
VarType
::
Type
>
(
Attr
<
int
>
(
"dtype"
));
auto
value
=
Attr
<
float
>
(
"value"
);
auto
force_cpu
=
Attr
<
bool
>
(
"force_cpu"
);
auto
&
out
=
*
scope
.
FindVar
(
Output
(
"Out"
))
->
GetMutable
<
framework
::
LoDTensor
>
();
out
.
Resize
(
framework
::
make_ddim
(
Attr
<
std
::
vector
<
int
>>
(
"shape"
)));
if
(
force_cpu
)
{
auto
cpu
=
platform
::
CPUPlace
();
out
.
mutable_data
(
cpu
,
framework
::
ToTypeIndex
(
data_type
));
}
else
{
out
.
mutable_data
(
dev_place
,
framework
::
ToTypeIndex
(
data_type
));
}
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
&
dev_ctx
=
*
pool
.
Get
(
dev_place
);
math
::
set_constant
(
dev_ctx
,
&
out
,
value
);
framework
::
OpKernelType
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
return
framework
::
OpKernelType
(
static_cast
<
framework
::
proto
::
VarType
::
Type
>
(
ctx
.
Attr
<
int
>
(
"dtype"
)),
ctx
.
device_context
());
}
};
...
...
@@ -87,6 +67,11 @@ Fill up a variable with specified constant value.
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
REGISTER_OPERATOR
(
fill_constant
,
ops
::
FillConstantOp
,
ops
::
FillConstantInferShape
,
ops
::
FillConstantOpMaker
,
REGISTER_OPERATOR
(
fill_constant
,
ops
::
FillConstantOp
,
ops
::
FillConstantOpMaker
,
paddle
::
framework
::
EmptyGradOpMaker
);
REGISTER_OP_CPU_KERNEL
(
fill_constant
,
ops
::
FillConstantOpKernel
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
,
ops
::
FillConstantOpKernel
<
paddle
::
platform
::
CPUDeviceContext
,
double
>
,
ops
::
FillConstantOpKernel
<
paddle
::
platform
::
CPUDeviceContext
,
int
>
,
ops
::
FillConstantOpKernel
<
paddle
::
platform
::
CPUDeviceContext
,
int64_t
>
)
paddle/fluid/operators/fill_constant_op.cu.cc
0 → 100644
浏览文件 @
32ca9fe7
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/fill_constant_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators;

// Registers FillConstantOpKernel as the CUDA kernel of the `fill_constant`
// op, instantiated once per element type listed below.
// NOTE(review): no ';' follows the macro call in this view, unlike most
// other registrations in this change - confirm against the original file.
REGISTER_OP_CUDA_KERNEL(
    fill_constant,
    ops::FillConstantOpKernel<paddle::platform::CUDADeviceContext, float>,
    ops::FillConstantOpKernel<paddle::platform::CUDADeviceContext, double>,
    ops::FillConstantOpKernel<paddle::platform::CUDADeviceContext, int>,
    ops::FillConstantOpKernel<paddle::platform::CUDADeviceContext, int64_t>,
    ops::FillConstantOpKernel<paddle::platform::CUDADeviceContext,
                              paddle::platform::float16>)
paddle/fluid/operators/fill_constant_op.h
0 → 100644
浏览文件 @
32ca9fe7
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace
paddle
{
namespace
operators
{
// Kernel for the fill_constant op.
//
// Reads the attributes "dtype", "value", "force_cpu" and "shape", resizes the
// output tensor "Out" to `shape`, allocates it with the element type named by
// `dtype`, and hands it to math::set_constant together with `value`.
//
// DeviceContext  device context type passed on to math::set_constant.
// T              element type this kernel instance is registered for.
template <typename DeviceContext, typename T>
class FillConstantOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    const auto dtype =
        static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype"));
    const auto fill_value = ctx.Attr<float>("value");
    const bool keep_on_cpu = ctx.Attr<bool>("force_cpu");

    auto *result = ctx.Output<framework::Tensor>("Out");
    result->Resize(framework::make_ddim(ctx.Attr<std::vector<int>>("shape")));

    // "force_cpu" allocates on CPUPlace regardless of where the kernel runs;
    // otherwise the kernel's own place is used.
    if (keep_on_cpu) {
      result->mutable_data(platform::CPUPlace(),
                           framework::ToTypeIndex(dtype));
    } else {
      result->mutable_data(ctx.GetPlace(), framework::ToTypeIndex(dtype));
    }

    math::set_constant(ctx.template device_context<DeviceContext>(), result,
                       fill_value);
  }
};
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/fill_op.cc
浏览文件 @
32ca9fe7
...
...
@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/float16.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -69,7 +70,6 @@ class FillOp : public framework::OperatorBase {
framework
::
VisitDataType
(
dtype
,
FillOpVisitor
(
&
tensor
,
Attr
<
std
::
vector
<
float
>>
(
"value"
)));
if
(
!
force_cpu
&&
platform
::
is_gpu_place
(
place
))
{
// Copy tensor to out
platform
::
DeviceContextPool
&
pool
=
...
...
paddle/fluid/operators/gaussian_random_op.cu
浏览文件 @
32ca9fe7
...
...
@@ -15,6 +15,7 @@ limitations under the License. */
#include <thrust/transform.h>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/float16.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -60,6 +61,7 @@ class GPUGaussianRandomKernel : public framework::OpKernel<T> {
}
// namespace operators
}
// namespace paddle
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_CUDA_KERNEL
(
gaussian_random
,
paddle
::
operators
::
GPUGaussianRandomKernel
<
float
>
,
paddle
::
operators
::
GPUGaussianRandomKernel
<
double
>
);
...
...
paddle/fluid/operators/math/cross_entropy.cu
浏览文件 @
32ca9fe7
...
...
@@ -15,11 +15,25 @@ limitations under the License. */
#include "paddle/fluid/operators/math/cross_entropy.h"
#include "paddle/fluid/platform/cuda_device_function.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/float16.h"
namespace
paddle
{
namespace
operators
{
namespace
math
{
// Natural logarithm wrapper, marked HOSTDEVICE so the kernels in this file
// can call it. The primary template forwards to std::log.
template <typename T>
HOSTDEVICE T log(const T& val) {
  return std::log(val);
}

// float16 specialization.
//
// The half-precision intrinsic hlog was reported as unavailable here, so the
// value is widened to float, the logarithm is taken in single precision, and
// the result is narrowed back to float16. (Returning a constant 0 instead
// would zero out every term that is computed from this logarithm.)
template <>
HOSTDEVICE platform::float16 log(const platform::float16& val) {
  return static_cast<platform::float16>(std::log(static_cast<float>(val)));
}
namespace
{
template
<
typename
T
>
__global__
void
CrossEntropyKernel
(
T
*
Y
,
const
T
*
X
,
const
int64_t
*
label
,
...
...
@@ -35,12 +49,12 @@ template <typename T>
__global__
void
SoftCrossEntropyKernel
(
T
*
Y
,
const
T
*
X
,
const
T
*
label
,
const
int
class_num
)
{
int
tid
=
threadIdx
.
x
;
T
val
=
0
;
T
val
(
0
)
;
int
idx
=
blockIdx
.
x
*
class_num
+
tid
;
int
end
=
blockIdx
.
x
*
class_num
+
class_num
;
for
(;
idx
<
end
;
idx
+=
blockDim
.
x
)
{
val
+=
math
::
TolerableValue
<
T
>
()(
std
::
log
(
X
[
idx
]))
*
label
[
idx
];
val
+=
math
::
TolerableValue
<
T
>
()(
log
(
X
[
idx
]))
*
label
[
idx
];
}
val
=
paddle
::
platform
::
reduceSum
(
val
,
tid
,
blockDim
.
x
);
...
...
@@ -84,6 +98,8 @@ class CrossEntropyFunctor<platform::CUDADeviceContext, T> {
template
class
CrossEntropyFunctor
<
platform
::
CUDADeviceContext
,
float
>;
template
class
CrossEntropyFunctor
<
platform
::
CUDADeviceContext
,
double
>;
template
class
CrossEntropyFunctor
<
platform
::
CUDADeviceContext
,
platform
::
float16
>;
}
// namespace math
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/math/cross_entropy.h
浏览文件 @
32ca9fe7
...
...
@@ -13,8 +13,10 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <limits>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/hostdevice.h"
namespace
paddle
{
...
...
@@ -33,6 +35,21 @@ struct TolerableValue {
}
};
// float16 values are clipped differently from the generic case.
using
paddle
::
platform
::
float16
;
using
paddle
::
platform
::
isfinite
;
// TolerableValue specialization for float16: replaces non-finite inputs with
// finite float16 limits so later arithmetic does not propagate inf.
//   finite x                  -> x (unchanged)
//   non-finite x with x > 0   -> largest finite float16
//   any other non-finite x    -> most negative finite float16
template <>
struct TolerableValue<float16> {
  HOSTDEVICE float16 operator()(const float16& x) const {
    if (isfinite(x)) {
      return x;
    }
    if (x > static_cast<float16>(0)) {
      return std::numeric_limits<float16>::max();
    }
    // For floating-point types numeric_limits<T>::min() is the smallest
    // POSITIVE normal value, so it must not be used as the negative clamp;
    // lowest() is the most negative finite value.
    // NOTE(review): NaN presumably also ends up here (it is neither finite
    // nor greater than zero) — confirm this is the intended mapping.
    return std::numeric_limits<float16>::lowest();
  }
};
template
<
typename
DeviceContext
,
typename
T
>
class
CrossEntropyFunctor
{
public:
...
...
paddle/fluid/operators/math/selected_rows_functor.cu
浏览文件 @
32ca9fe7
...
...
@@ -18,6 +18,7 @@ limitations under the License. */
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/float16.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -76,6 +77,7 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> {
template
struct
SelectedRowsAdd
<
platform
::
CUDADeviceContext
,
float
>;
template
struct
SelectedRowsAdd
<
platform
::
CUDADeviceContext
,
double
>;
template
struct
SelectedRowsAdd
<
platform
::
CUDADeviceContext
,
platform
::
float16
>;
namespace
{
template
<
typename
T
,
int
block_size
>
...
...
@@ -120,7 +122,7 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
auto
*
out_data
=
output
->
data
<
T
>
();
SetConstant
<
platform
::
CUDADeviceContext
,
T
>
functor
;
functor
(
context
,
output
,
0.0
);
functor
(
context
,
output
,
static_cast
<
T
>
(
0
)
);
const
int
block_size
=
256
;
dim3
threads
(
block_size
,
1
);
...
...
@@ -138,6 +140,8 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
template
struct
SelectedRowsAddTensor
<
platform
::
CUDADeviceContext
,
float
>;
template
struct
SelectedRowsAddTensor
<
platform
::
CUDADeviceContext
,
double
>;
template
struct
SelectedRowsAddTensor
<
platform
::
CUDADeviceContext
,
platform
::
float16
>;
template
<
typename
T
>
struct
SelectedRowsAddTo
<
platform
::
CUDADeviceContext
,
T
>
{
...
...
@@ -177,6 +181,8 @@ template struct SelectedRowsAddTo<platform::CUDADeviceContext, float>;
template
struct
SelectedRowsAddTo
<
platform
::
CUDADeviceContext
,
double
>;
template
struct
SelectedRowsAddTo
<
platform
::
CUDADeviceContext
,
int
>;
template
struct
SelectedRowsAddTo
<
platform
::
CUDADeviceContext
,
int64_t
>;
template
struct
SelectedRowsAddTo
<
platform
::
CUDADeviceContext
,
platform
::
float16
>;
namespace
{
template
<
typename
T
,
int
block_size
>
...
...
@@ -229,6 +235,8 @@ template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, float>;
template
struct
SelectedRowsAddToTensor
<
platform
::
CUDADeviceContext
,
double
>;
template
struct
SelectedRowsAddToTensor
<
platform
::
CUDADeviceContext
,
int
>;
template
struct
SelectedRowsAddToTensor
<
platform
::
CUDADeviceContext
,
int64_t
>;
template
struct
SelectedRowsAddToTensor
<
platform
::
CUDADeviceContext
,
platform
::
float16
>;
namespace
scatter
{
...
...
@@ -276,7 +284,7 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
context
.
GetPlace
());
math
::
SetConstant
<
platform
::
CUDADeviceContext
,
T
>
constant_functor
;
constant_functor
(
context
,
out
.
mutable_value
(),
0.0
);
constant_functor
(
context
,
out
.
mutable_value
(),
static_cast
<
T
>
(
0
)
);
auto
*
out_data
=
out
.
mutable_value
()
->
data
<
T
>
();
auto
*
input_data
=
input
.
value
().
data
<
T
>
();
...
...
@@ -300,6 +308,7 @@ template struct MergeAdd<platform::CUDADeviceContext, float>;
template
struct
MergeAdd
<
platform
::
CUDADeviceContext
,
double
>;
template
struct
MergeAdd
<
platform
::
CUDADeviceContext
,
int
>;
template
struct
MergeAdd
<
platform
::
CUDADeviceContext
,
int64_t
>;
template
struct
MergeAdd
<
platform
::
CUDADeviceContext
,
platform
::
float16
>;
template
<
typename
T
,
int
block_size
>
__global__
void
UpdateToTensorKernel
(
const
T
*
selected_rows
,
...
...
paddle/fluid/operators/math/softmax.cu
浏览文件 @
32ca9fe7
...
...
@@ -94,12 +94,15 @@ void SoftmaxGradCUDNNFunctor<T>::operator()(
template
class
SoftmaxCUDNNFunctor
<
platform
::
float16
>;
template
class
SoftmaxCUDNNFunctor
<
float
>;
template
class
SoftmaxCUDNNFunctor
<
double
>;
template
class
SoftmaxGradCUDNNFunctor
<
platform
::
float16
>;
template
class
SoftmaxGradCUDNNFunctor
<
float
>;
template
class
SoftmaxGradCUDNNFunctor
<
double
>;
template
class
SoftmaxFunctor
<
platform
::
CUDADeviceContext
,
platform
::
float16
>;
template
class
SoftmaxFunctor
<
platform
::
CUDADeviceContext
,
float
>;
template
class
SoftmaxFunctor
<
platform
::
CUDADeviceContext
,
double
>;
template
class
SoftmaxGradFunctor
<
platform
::
CUDADeviceContext
,
platform
::
float16
>;
template
class
SoftmaxGradFunctor
<
platform
::
CUDADeviceContext
,
float
>;
template
class
SoftmaxGradFunctor
<
platform
::
CUDADeviceContext
,
double
>;
...
...
paddle/fluid/operators/mean_op.cu
浏览文件 @
32ca9fe7
...
...
@@ -12,14 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/fluid/operators/mean_op.h"
#include "paddle/fluid/platform/float16.h"
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_CUDA_KERNEL
(
mean
,
ops
::
MeanKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
MeanKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
);
ops
::
MeanKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
ops
::
MeanKernel
<
paddle
::
platform
::
CUDADeviceContext
,
plat
::
float16
>
);
REGISTER_OP_CUDA_KERNEL
(
mean_grad
,
ops
::
MeanGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
MeanGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
);
ops
::
MeanGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
ops
::
MeanGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
plat
::
float16
>
);
paddle/fluid/operators/mean_op.h
浏览文件 @
32ca9fe7
...
...
@@ -55,7 +55,7 @@ class MeanGradKernel : public framework::OpKernel<T> {
IG
->
mutable_data
<
T
>
(
context
.
GetPlace
());
T
ig_size
=
static_cast
<
T
>
(
IG
->
numel
());
Eigen
::
DSizes
<
int
,
1
>
bcast
(
ig_size
);
Eigen
::
DSizes
<
int
,
1
>
bcast
(
static_cast
<
int
>
(
ig_size
)
);
EigenVector
<
T
>::
Flatten
(
*
IG
).
device
(
*
context
.
template
device_context
<
DeviceContext
>().
eigen_device
())
=
...
...
paddle/fluid/operators/mul_op.cu.cc
浏览文件 @
32ca9fe7
...
...
@@ -20,6 +20,7 @@ namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL
(
mul
,
ops
::
MulKernel
<
plat
::
CUDADeviceContext
,
float
>
,
ops
::
MulKernel
<
plat
::
CUDADeviceContext
,
double
>
,
ops
::
MulKernel
<
plat
::
CUDADeviceContext
,
plat
::
float16
>
);
REGISTER_OP_CUDA_KERNEL
(
mul_grad
,
ops
::
MulGradKernel
<
plat
::
CUDADeviceContext
,
float
>
,
ops
::
MulGradKernel
<
plat
::
CUDADeviceContext
,
double
>
);
REGISTER_OP_CUDA_KERNEL
(
mul_grad
,
ops
::
MulGradKernel
<
plat
::
CUDADeviceContext
,
float
>
,
ops
::
MulGradKernel
<
plat
::
CUDADeviceContext
,
double
>
,
ops
::
MulGradKernel
<
plat
::
CUDADeviceContext
,
plat
::
float16
>
);
paddle/fluid/operators/pool_cudnn_op.cu.cc
浏览文件 @
32ca9fe7
...
...
@@ -174,7 +174,8 @@ REGISTER_OP_KERNEL(pool2d, CUDNN, plat::CUDAPlace,
ops
::
PoolCUDNNOpKernel
<
plat
::
float16
>
);
REGISTER_OP_KERNEL
(
pool2d_grad
,
CUDNN
,
plat
::
CUDAPlace
,
ops
::
PoolCUDNNGradOpKernel
<
float
>
,
ops
::
PoolCUDNNGradOpKernel
<
double
>
);
ops
::
PoolCUDNNGradOpKernel
<
double
>
,
ops
::
PoolCUDNNGradOpKernel
<
plat
::
float16
>
);
REGISTER_OP_KERNEL
(
pool3d
,
CUDNN
,
plat
::
CUDAPlace
,
ops
::
PoolCUDNNOpKernel
<
float
>
,
...
...
@@ -182,4 +183,5 @@ REGISTER_OP_KERNEL(pool3d, CUDNN, plat::CUDAPlace,
ops
::
PoolCUDNNOpKernel
<
plat
::
float16
>
);
REGISTER_OP_KERNEL
(
pool3d_grad
,
CUDNN
,
plat
::
CUDAPlace
,
ops
::
PoolCUDNNGradOpKernel
<
float
>
,
ops
::
PoolCUDNNGradOpKernel
<
double
>
);
ops
::
PoolCUDNNGradOpKernel
<
double
>
,
ops
::
PoolCUDNNGradOpKernel
<
plat
::
float16
>
);
paddle/fluid/operators/prelu_op.cc
浏览文件 @
32ca9fe7
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
...
@@ -26,14 +23,40 @@ class PReluOp : public framework::OperatorWithKernel {
:
OperatorWithKernel
(
type
,
inputs
,
outputs
,
attrs
)
{}
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
std
::
string
mode
=
ctx
->
Attrs
().
Get
<
std
::
string
>
(
"mode"
);
auto
x_dim
=
ctx
->
GetInputDim
(
"X"
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"X"
),
"Input(X) should not be null"
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Alpha"
),
"Input(Alpha) should not be null"
);
PADDLE_ENFORCE
(
product
(
ctx
->
GetInputDim
(
"Alpha"
))
==
1
,
"Size of weight Alpha must be one."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Out"
),
"Output(Out) should not be null"
);
ctx
->
SetOutputDim
(
"Out"
,
ctx
->
GetInputDim
(
"X"
));
if
(
mode
==
"all"
)
{
PADDLE_ENFORCE
(
product
(
ctx
->
GetInputDim
(
"Alpha"
))
==
1
,
"For mode 'all', size of weight Alpha must be one."
);
}
else
if
(
mode
==
"channel"
)
{
PADDLE_ENFORCE
(
product
(
ctx
->
GetInputDim
(
"Alpha"
))
==
x_dim
[
1
],
"For channel-wise mode, size of weight Alpha must be "
"equal to the number of channels, should be %d"
,
x_dim
[
1
]);
}
else
if
(
mode
==
"element"
)
{
PADDLE_ENFORCE
(
product
(
ctx
->
GetInputDim
(
"Alpha"
))
==
product
(
x_dim
),
"For element-wise mode, size of weight Alpha must be "
"equal to the number of input, should be %d"
,
product
(
x_dim
));
}
else
{
PADDLE_THROW
(
"Unkown mode %s"
,
mode
);
}
ctx
->
SetOutputDim
(
"Out"
,
x_dim
);
ctx
->
ShareLoD
(
"X"
,
/*->*/
"Out"
);
}
protected:
framework
::
OpKernelType
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
return
framework
::
OpKernelType
(
framework
::
ToDataType
(
ctx
.
Input
<
Tensor
>
(
"X"
)
->
type
()),
platform
::
CPUPlace
());
}
};
class
PReluOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
...
...
@@ -44,9 +67,7 @@ class PReluOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput
(
"Out"
,
"The output tensor of prelu operator."
);
AddComment
(
R"DOC(
PRelu Operator.
The equation is:
$$
f(x) =
\begin{cases}
...
...
@@ -54,11 +75,15 @@ f(x) =
x, \qquad \text{if} \ x >= 0
\end{cases}
$$
The input `X` can carry the LoD (Level of Details) information,
or not. And the output shares the LoD information with input `X`.
There are modes:
all: all elements share same weight
channel: elements in a channel share same weight
element: each element has a weight
)DOC"
);
AddAttr
<
std
::
string
>
(
"mode"
,
"The mode for inputs to share weights."
)
.
SetDefault
(
"all"
);
}
};
...
...
@@ -71,9 +96,23 @@ class PReluGradOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"X"
),
"Input(X) must not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
framework
::
GradVarName
(
"Out"
)),
"Input(Out@GRAD) should not be null"
);
ctx
->
SetOutputDim
(
framework
::
GradVarName
(
"X"
),
ctx
->
GetInputDim
(
"X"
));
ctx
->
SetOutputDim
(
framework
::
GradVarName
(
"Alpha"
),
ctx
->
GetInputDim
(
"Alpha"
));
auto
x_grad_name
=
framework
::
GradVarName
(
"X"
);
auto
alpha_grad_name
=
framework
::
GradVarName
(
"Alpha"
);
if
(
ctx
->
HasOutput
(
x_grad_name
))
{
ctx
->
SetOutputDim
(
x_grad_name
,
ctx
->
GetInputDim
(
"X"
));
}
if
(
ctx
->
HasOutput
(
alpha_grad_name
))
{
ctx
->
SetOutputDim
(
alpha_grad_name
,
ctx
->
GetInputDim
(
"Alpha"
));
}
}
protected:
framework
::
OpKernelType
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
return
framework
::
OpKernelType
(
framework
::
ToDataType
(
ctx
.
Input
<
Tensor
>
(
"X"
)
->
type
()),
platform
::
CPUPlace
());
}
};
...
...
paddle/fluid/operators/prelu_op.cu
已删除
100644 → 0
浏览文件 @
ba84a6b7
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/prelu_op.h"
REGISTER_OP_CUDA_KERNEL
(
prelu
,
paddle
::
operators
::
PReluKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
);
REGISTER_OP_CUDA_KERNEL
(
prelu_grad
,
paddle
::
operators
::
PReluGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
);
paddle/fluid/operators/prelu_op.h
浏览文件 @
32ca9fe7
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
...
@@ -13,32 +10,16 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/transform.h"
namespace
paddle
{
namespace
operators
{
using
Tensor
=
framework
::
Tensor
;
using
platform
::
Transform
;
template
<
typename
T
>
class
PReluFunctor
{
public:
explicit
PReluFunctor
(
const
T
*
alpha
)
:
alpha_
(
alpha
)
{}
HOSTDEVICE
T
operator
()(
const
T
&
x
)
const
{
if
(
x
>
0
)
return
x
;
else
return
x
*
(
*
alpha_
);
}
private:
const
T
*
alpha_
;
};
template
<
typename
DeviceContext
,
typename
T
>
class
PReluKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
...
...
@@ -50,53 +31,93 @@ class PReluKernel : public framework::OpKernel<T> {
const
T
*
x_ptr
=
x
->
data
<
T
>
();
T
*
o_ptr
=
out
->
mutable_data
<
T
>
(
context
.
GetPlace
());
auto
*
alpha_ptr
=
alpha
->
data
<
T
>
();
const
T
*
alpha_ptr
=
alpha
->
data
<
T
>
();
std
::
string
mode
=
context
.
Attr
<
std
::
string
>
(
"mode"
);
int
numel
=
x
->
numel
();
Transform
<
DeviceContext
>
trans
;
trans
(
context
.
template
device_context
<
DeviceContext
>(),
x_ptr
,
x_ptr
+
numel
,
o_ptr
,
PReluFunctor
<
T
>
(
alpha_ptr
));
}
};
template
<
typename
T
>
class
PReluGradFunctor
{
public:
explicit
PReluGradFunctor
(
const
T
*
alpha
)
:
alpha_
(
alpha
)
{}
HOSTDEVICE
T
operator
()(
const
T
&
out
,
const
T
&
dout
)
const
{
if
(
out
>
0
)
return
dout
;
else
return
dout
*
(
*
alpha_
);
auto
dim
=
x
->
dims
();
int
index
=
0
;
int
i
=
0
;
int
temp
=
0
;
if
(
mode
==
"channel"
)
{
for
(
i
=
0
;
i
<
numel
;
i
++
)
{
temp
=
numel
/
(
dim
[
0
]
*
dim
[
1
]);
index
=
(
i
/
temp
)
%
dim
[
1
];
o_ptr
[
i
]
=
x_ptr
[
i
]
>
0
?
x_ptr
[
i
]
:
alpha_ptr
[
index
]
*
x_ptr
[
i
];
}
}
else
if
(
mode
==
"element"
)
{
for
(
i
=
0
;
i
<
numel
;
i
++
)
{
o_ptr
[
i
]
=
x_ptr
[
i
]
>
0
?
x_ptr
[
i
]
:
alpha_ptr
[
i
]
*
x_ptr
[
i
];
}
}
else
{
for
(
i
=
0
;
i
<
numel
;
i
++
)
{
o_ptr
[
i
]
=
x_ptr
[
i
]
>
0
?
x_ptr
[
i
]
:
alpha_ptr
[
0
]
*
x_ptr
[
i
];
}
}
}
private:
const
T
*
alpha_
;
};
// Gradient kernel for the PRelu operator (plain host loops, no GPU path).
//
// Inputs : X, Alpha, Out@GRAD
// Outputs: X@GRAD and Alpha@GRAD (each is computed only when requested)
//
// The "mode" attribute selects how Alpha is indexed:
//   "channel" : one weight per index along dim[1] of X
//   "element" : one weight per element of X
//   otherwise : a single weight shared by every element
template <typename DeviceContext, typename T>
class PReluGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* x = context.Input<Tensor>("X");
    auto* dx = context.Output<Tensor>(framework::GradVarName("X"));
    auto* dout = context.Input<Tensor>(framework::GradVarName("Out"));
    auto* dalpha = context.Output<Tensor>(framework::GradVarName("Alpha"));
    auto* alpha = context.Input<Tensor>("Alpha");

    const T* alpha_ptr = alpha->data<T>();
    const T* x_ptr = x->data<T>();
    const T* dout_ptr = dout->data<T>();

    const std::string mode = context.Attr<std::string>("mode");
    const bool by_channel = (mode == "channel");
    const bool by_element = (mode == "element");

    const int64_t numel = x->numel();
    const auto dim = x->dims();

    // Number of consecutive elements that share one channel weight. It is
    // loop-invariant, so compute it once; skip the division for empty input
    // where dim[0] * dim[1] could be zero.
    int64_t channel_stride = 1;
    if (by_channel && numel > 0) {
      channel_stride = numel / (dim[0] * dim[1]);
    }

    // Maps a flat element index to the index of the Alpha weight it uses.
    auto alpha_index = [&](int64_t i) -> int64_t {
      if (by_channel) return (i / channel_stride) % dim[1];
      if (by_element) return i;
      return 0;
    };

    // The forward pass branches on the sign of X, so the gradient must too.
    // Branching on Out is only equivalent while Alpha is non-negative: with a
    // negative weight, a negative input produces a positive output.
    if (dx) {
      T* dx_ptr = dx->mutable_data<T>(context.GetPlace());
      for (int64_t i = 0; i < numel; ++i) {
        dx_ptr[i] = x_ptr[i] > 0 ? dout_ptr[i]
                                 : alpha_ptr[alpha_index(i)] * dout_ptr[i];
      }
    }

    if (dalpha) {
      T* dalpha_ptr = dalpha->mutable_data<T>(context.GetPlace());
      // The loop below accumulates with +=, so the buffer has to start from
      // zero; it was previously accumulated into without being cleared.
      const int64_t dalpha_numel = dalpha->numel();
      for (int64_t j = 0; j < dalpha_numel; ++j) {
        dalpha_ptr[j] = static_cast<T>(0);
      }
      for (int64_t i = 0; i < numel; ++i) {
        // Only elements on the "alpha * x" branch contribute to dAlpha.
        if (!(x_ptr[i] > 0)) {
          dalpha_ptr[alpha_index(i)] += x_ptr[i] * dout_ptr[i];
        }
      }
    }

    // TODO(Guanzhong): add GPU kernels
  }
};
...
...
paddle/fluid/operators/scale_op.cu
浏览文件 @
32ca9fe7
...
...
@@ -13,11 +13,15 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/scale_op.h"
#include "paddle/fluid/platform/float16.h"
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_CUDA_KERNEL
(
scale
,
paddle
::
operators
::
ScaleKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
paddle
::
operators
::
ScaleKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
paddle
::
operators
::
ScaleKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int
>
,
paddle
::
operators
::
ScaleKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
);
int64_t
>
,
paddle
::
operators
::
ScaleKernel
<
paddle
::
platform
::
CUDADeviceContext
,
plat
::
float16
>
);
paddle/fluid/operators/scatter_op.h
浏览文件 @
32ca9fe7
...
...
@@ -35,7 +35,7 @@ class ScatterOpKernel : public framework::OpKernel<T> {
auto
*
Out
=
ctx
.
Output
<
Tensor
>
(
"Out"
);
// In place output: Out = X, Out[Ids] += Updates
Out
->
ShareDataWith
(
*
X
);
framework
::
TensorCopySync
(
*
X
,
ctx
.
GetPlace
(),
Out
);
// Apply ScatterUpdate: Out[index] += Updates[:]
ScatterAssign
<
T
>
(
ctx
.
device_context
(),
*
Updates
,
*
Ids
,
Out
);
}
...
...
@@ -53,7 +53,7 @@ class ScatterGradientOpKernel : public framework::OpKernel<T> {
auto
*
dOut
=
ctx
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"Out"
));
// In place gradient: dX = dO
dX
->
ShareDataWith
(
*
dOut
);
framework
::
TensorCopySync
(
*
dOut
,
ctx
.
GetPlace
(),
dX
);
dUpdates
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
// Gradient by Gather: dUpdates += dO[Ids]
CPUGather
<
T
>
(
ctx
.
device_context
(),
*
dOut
,
*
Ids
,
dUpdates
);
...
...
paddle/fluid/operators/softmax_cudnn_op.cu.cc
浏览文件 @
32ca9fe7
...
...
@@ -78,4 +78,5 @@ REGISTER_OP_KERNEL(softmax, CUDNN, plat::CUDAPlace,
ops
::
SoftmaxCUDNNKernel
<
float
>
,
ops
::
SoftmaxCUDNNKernel
<
plat
::
float16
>
);
REGISTER_OP_KERNEL
(
softmax_grad
,
CUDNN
,
plat
::
CUDAPlace
,
ops
::
SoftmaxGradCUDNNKernel
<
float
>
);
ops
::
SoftmaxGradCUDNNKernel
<
float
>
,
ops
::
SoftmaxGradCUDNNKernel
<
plat
::
float16
>
);
paddle/fluid/operators/softmax_op.cu.cc
浏览文件 @
32ca9fe7
...
...
@@ -23,4 +23,5 @@ REGISTER_OP_CUDA_KERNEL(
ops
::
SoftmaxKernel
<
plat
::
CUDADeviceContext
,
plat
::
float16
>
);
REGISTER_OP_CUDA_KERNEL
(
softmax_grad
,
ops
::
SoftmaxGradKernel
<
plat
::
CUDADeviceContext
,
float
>
,
ops
::
SoftmaxGradKernel
<
plat
::
CUDADeviceContext
,
double
>
);
ops
::
SoftmaxGradKernel
<
plat
::
CUDADeviceContext
,
double
>
,
ops
::
SoftmaxGradKernel
<
plat
::
CUDADeviceContext
,
plat
::
float16
>
);
paddle/fluid/operators/sum_op.cu
浏览文件 @
32ca9fe7
...
...
@@ -11,10 +11,13 @@ limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/fluid/operators/sum_op.h"
#include "paddle/fluid/platform/float16.h"
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_CUDA_KERNEL
(
sum
,
ops
::
SumKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
SumKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
ops
::
SumKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int
>
,
ops
::
SumKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
);
ops
::
SumKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
,
ops
::
SumKernel
<
paddle
::
platform
::
CUDADeviceContext
,
plat
::
float16
>
);
paddle/fluid/operators/sum_op.h
浏览文件 @
32ca9fe7
...
...
@@ -46,7 +46,7 @@ class SumKernel : public framework::OpKernel<T> {
if
(
!
in_place
)
{
math
::
SetConstant
<
DeviceContext
,
T
>
constant_functor
;
constant_functor
(
context
.
template
device_context
<
DeviceContext
>(),
out
,
0.0
);
static_cast
<
T
>
(
0
)
);
}
math
::
SelectedRowsAddToTensor
<
DeviceContext
,
T
>
functor
;
...
...
paddle/fluid/operators/top_k_op.cu
浏览文件 @
32ca9fe7
...
...
@@ -11,16 +11,19 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <limits>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/top_k_op.h"
#include "paddle/fluid/platform/assert.h"
#include "paddle/fluid/platform/cuda_device_function.h"
#include "paddle/fluid/platform/float16.h"
namespace
paddle
{
namespace
operators
{
using
Tensor
=
framework
::
Tensor
;
using
paddle
::
platform
::
float16
;
template
<
typename
T
>
struct
Pair
{
...
...
@@ -32,6 +35,11 @@ struct Pair {
id
=
id
;
}
__device__
__forceinline__
void
clear
()
{
v
=
-
INFINITY
;
id
=
-
1
;
}
__device__
__forceinline__
void
operator
=
(
const
Pair
<
T
>&
in
)
{
v
=
in
.
v
;
id
=
in
.
id
;
...
...
@@ -53,6 +61,12 @@ struct Pair {
int64_t
id
;
};
// float16 specialization of Pair<T>::clear(): resets the slot to the
// "empty" sentinel (value = -infinity, id = -1), matching the generic
// clear() which assigns -INFINITY.
template <>
__device__ __forceinline__ void Pair<float16>::clear() {
  // 0xfc00 is the IEEE 754 binary16 bit pattern for -infinity
  // (sign = 1, exponent = 0x1f, mantissa = 0). The previous pattern 0x0400
  // is the smallest positive normal value, so a cleared slot compared
  // greater than any zero or negative candidate.
  v = platform::raw_uint16_to_float16(0xfc00);
  id = -1;
}
template
<
typename
T
>
__device__
__forceinline__
void
AddTo
(
Pair
<
T
>
topk
[],
const
Pair
<
T
>&
p
,
int
beam_size
)
{
...
...
@@ -150,7 +164,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
if
(
k
<
MaxLength
-
(
*
beam
))
{
topk
[
k
]
=
topk
[
k
+
*
beam
];
}
else
{
topk
[
k
].
set
(
-
INFINITY
,
-
1
);
topk
[
k
].
clear
(
);
}
}
if
(
!
(
*
is_empty
))
{
...
...
@@ -160,7 +174,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
}
*
max
=
topk
[
MaxLength
-
1
];
if
((
*
max
).
v
==
-
1
)
*
is_empty
=
true
;
if
((
*
max
).
v
==
static_cast
<
T
>
(
-
1
)
)
*
is_empty
=
true
;
*
beam
=
0
;
}
}
...
...
@@ -181,7 +195,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
if
(
k
<
MaxLength
-
*
beam
)
{
topk
[
k
]
=
topk
[
k
+
*
beam
];
}
else
{
topk
[
k
].
set
(
-
INFINITY
,
-
1
);
topk
[
k
].
set
(
std
::
numeric_limits
<
T
>::
min
()
,
-
1
);
}
}
if
(
!
(
*
is_empty
))
{
...
...
@@ -273,7 +287,7 @@ __global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices,
bool
firststep
=
true
;
for
(
int
k
=
0
;
k
<
MaxLength
;
k
++
)
{
topk
[
k
].
set
(
-
INFINITY
,
-
1
);
topk
[
k
].
clear
(
);
}
while
(
k
)
{
ThreadGetTopK
<
T
,
MaxLength
,
BlockSize
>
(
topk
,
&
beam
,
k
,
...
...
@@ -325,5 +339,7 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
}
// namespace operators
}
// namespace paddle
REGISTER_OP_CUDA_KERNEL
(
top_k
,
paddle
::
operators
::
TopkOpCUDAKernel
<
float
>
,
paddle
::
operators
::
TopkOpCUDAKernel
<
double
>
);
REGISTER_OP_CUDA_KERNEL
(
top_k
,
paddle
::
operators
::
TopkOpCUDAKernel
<
float
>
,
paddle
::
operators
::
TopkOpCUDAKernel
<
double
>
,
paddle
::
operators
::
TopkOpCUDAKernel
<
paddle
::
platform
::
float16
>
);
paddle/fluid/operators/uniform_random_op.cu
浏览文件 @
32ca9fe7
...
...
@@ -11,10 +11,14 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <glog/logging.h>
#include <thrust/random.h>
#include <thrust/transform.h>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/transform.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -36,6 +40,11 @@ struct UniformGenerator {
}
};
// Unary functor that converts a value of type T to type V with static_cast.
// Marked HOSTDEVICE so it can be applied on either side.
template <typename T, typename V>
struct CastFunctor {
  // const-qualified: the functor holds no state, and this lets it be invoked
  // through const objects and const references as well.
  HOSTDEVICE V operator()(const T& a) const { return static_cast<V>(a); }
};
// It seems that Eigen::Tensor::random in GPU will SEGFAULT.
// Use std::random and thrust::random(thrust is a std library in CUDA) to
// implement uniform random.
...
...
@@ -66,18 +75,50 @@ class GPUUniformRandomKernel : public framework::OpKernel<T> {
T
max
=
static_cast
<
T
>
(
context
.
Attr
<
float
>
(
"max"
));
thrust
::
counting_iterator
<
unsigned
int
>
index_sequence_begin
(
0
);
int64_t
size
=
tensor
->
numel
();
thrust
::
transform
(
index_sequence_begin
,
index_sequence_begin
+
size
,
thrust
::
device_ptr
<
T
>
(
data
),
UniformGenerator
<
T
>
(
min
,
max
,
seed
));
if
(
out_var
->
IsType
<
framework
::
LoDTensor
>
()
&&
std
::
type_index
(
typeid
(
T
))
==
std
::
type_index
(
typeid
(
platform
::
float16
)))
{
framework
::
Tensor
master_copy_tensor
;
master_copy_tensor
.
Resize
(
tensor
->
dims
());
float
*
master_copy_tensor_data
=
master_copy_tensor
.
mutable_data
<
float
>
(
context
.
GetPlace
());
thrust
::
transform
(
index_sequence_begin
,
index_sequence_begin
+
size
,
thrust
::
device_ptr
<
float
>
(
master_copy_tensor_data
),
UniformGenerator
<
float
>
(
static_cast
<
float
>
(
min
),
static_cast
<
float
>
(
max
),
seed
));
platform
::
Transform
<
platform
::
CUDADeviceContext
>
trans
;
auto
*
in_begin
=
master_copy_tensor
.
data
<
float
>
();
auto
*
in_end
=
in_begin
+
master_copy_tensor
.
numel
();
auto
*
out_begin
=
tensor
->
mutable_data
<
T
>
(
context
.
GetPlace
());
trans
(
context
.
template
device_context
<
platform
::
CUDADeviceContext
>(),
in_begin
,
in_end
,
out_begin
,
CastFunctor
<
float
,
T
>
());
}
else
{
thrust
::
transform
(
index_sequence_begin
,
index_sequence_begin
+
size
,
thrust
::
device_ptr
<
T
>
(
data
),
UniformGenerator
<
T
>
(
min
,
max
,
seed
));
}
if
(
VLOG_IS_ON
(
5
))
{
framework
::
Tensor
cpu_tensor
;
framework
::
TensorCopySync
(
*
tensor
,
platform
::
CPUPlace
(),
&
cpu_tensor
);
auto
&
dev_ctx
=
*
platform
::
DeviceContextPool
::
Instance
().
Get
(
context
.
GetPlace
());
dev_ctx
.
Wait
();
auto
x
=
framework
::
EigenVector
<
T
>::
Flatten
(
cpu_tensor
);
VLOG
(
5
)
<<
"The Uniform output "
<<
x
;
}
}
};
}
// namespace operators
}
// namespace paddle
REGISTER_OP_CUDA_KERNEL
(
uniform_random
,
paddle
::
operators
::
GPUUniformRandomKernel
<
float
>
,
paddle
::
operators
::
GPUUniformRandomKernel
<
double
>
);
REGISTER_OP_CUDA_KERNEL
(
uniform_random_batch_size_like
,
paddle
::
operators
::
GPUUniformRandomKernel
<
float
>
,
paddle
::
operators
::
GPUUniformRandomKernel
<
double
>
);
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_CUDA_KERNEL
(
uniform_random
,
paddle
::
operators
::
GPUUniformRandomKernel
<
float
>
,
paddle
::
operators
::
GPUUniformRandomKernel
<
double
>
,
paddle
::
operators
::
GPUUniformRandomKernel
<
plat
::
float16
>
);
REGISTER_OP_CUDA_KERNEL
(
uniform_random_batch_size_like
,
paddle
::
operators
::
GPUUniformRandomKernel
<
float
>
,
paddle
::
operators
::
GPUUniformRandomKernel
<
double
>
,
paddle
::
operators
::
GPUUniformRandomKernel
<
plat
::
float16
>
);
python/CMakeLists.txt
浏览文件 @
32ca9fe7
...
...
@@ -97,10 +97,11 @@ if(APPLE)
if
(
NOT INSTALL_NAME_TOOL_EXECUTABLE
)
message
(
FATAL_ERROR
"install_name_tool not found, please check.
\n
"
)
endif
()
else
(
APPLE
)
endif
()
if
(
LINUX
)
find_program
(
PATCHELF_EXECUTABLE patchelf
)
if
(
NOT PATCHELF_EXECUTABLE
)
message
(
FATAL_ERROR
"patchelf not found, please install it.
\n
"
"For Ubuntu, the command is: apt-get install -y patchelf."
)
endif
()
endif
(
APPLE
)
endif
(
LINUX
)
python/paddle/fluid/layers/nn.py
浏览文件 @
32ca9fe7
...
...
@@ -112,6 +112,7 @@ __all__ = [
'log'
,
'crop'
,
'rank_loss'
,
'prelu'
,
'flatten'
,
]
...
...
@@ -5089,7 +5090,7 @@ def random_crop(x, shape, seed=None):
return
out
def
log
(
x
):
def
log
(
x
,
name
=
None
):
"""
Calculates the natural log of the given input tensor, element-wise.
...
...
@@ -5099,6 +5100,8 @@ def log(x):
Args:
x (Variable): Input tensor.
name (str|None, default None): A name for this layer If set None,
the layer will be named automatically.
Returns:
Variable: The natural log of the input tensor computed element-wise.
...
...
@@ -5116,7 +5119,7 @@ def log(x):
return
out
def
relu
(
x
):
def
relu
(
x
,
name
=
None
):
"""
Relu takes one input data (Tensor) and produces one output data (Tensor)
where the rectified linear function, y = max(0, x), is applied to
...
...
@@ -5128,6 +5131,8 @@ def relu(x):
Args:
x (Variable): The input tensor.
name (str|None, default None): A name for this layer If set None,
the layer will be named automatically.
Returns:
Variable: The output tensor with the same shape as input.
...
...
@@ -5364,6 +5369,59 @@ def rank_loss(label, left, right, name=None):
return
out
def prelu(x, mode, param_attr=None, name=None):
    """
    Parametric ReLU layer.

    Equation:
        y = max(0, x) + alpha * min(0, x)

    Args:
        x (Variable): The input tensor.
        mode (string): The mode for weight sharing. One of:
            all: all elements share the same weight
            channel: elements in a channel share the same weight
            element: each element has its own weight
        param_attr (ParamAttr|None): The parameter attribute for the
            learnable weight (alpha).
        name (str|None): A name for this layer (optional). If set None, the
            layer will be named automatically.

    Returns:
        Variable: The output tensor with the same shape as input.

    Raises:
        ValueError: If mode is not one of 'all', 'channel' or 'element'.

    Examples:

        .. code-block:: python

            x = fluid.layers.data(name="x", shape=[10,10], dtype="float32")
            mode = 'channel'
            output = fluid.layers.prelu(x,mode)
    """
    # Reject a bad mode before the LayerHelper is built, so an invalid call
    # fails without touching the program. No local may be introduced before
    # the LayerHelper call because it forwards **locals().
    if mode not in ['all', 'channel', 'element']:
        raise ValueError('mode should be one of all, channel, element.')
    helper = LayerHelper('prelu', **locals())

    # Shape of the learnable weight for each sharing mode.
    if mode == 'channel':
        alpha_shape = [1, x.shape[1], 1, 1]
    elif mode == 'element':
        alpha_shape = x.shape
    else:
        alpha_shape = [1]

    dtype = helper.input_dtype(input_param_name='x')
    # The kernel reads Alpha with the same element type as X, so the weight
    # is created with the input dtype rather than a hard-coded 'float32'.
    alpha = helper.create_parameter(
        attr=param_attr,
        shape=alpha_shape,
        dtype=dtype,
        is_bias=False,
        default_initializer=Constant(1.0))
    out = helper.create_tmp_variable(dtype)
    helper.append_op(
        type="prelu",
        inputs={"X": x,
                'Alpha': alpha},
        attrs={"mode": mode},
        outputs={"Out": out})
    return out
def
flatten
(
x
,
axis
=
1
,
name
=
None
):
"""
**Flatten layer**
...
...
python/paddle/fluid/tests/unittests/test_layers.py
浏览文件 @
32ca9fe7
...
...
@@ -21,6 +21,7 @@ import paddle.fluid.nets as nets
from
paddle.fluid.framework
import
Program
,
program_guard
,
default_main_program
from
paddle.fluid.param_attr
import
ParamAttr
import
decorators
from
paddle.fluid.initializer
import
Constant
class
TestBook
(
unittest
.
TestCase
):
...
...
@@ -485,6 +486,20 @@ class TestBook(unittest.TestCase):
self
.
assertIsNotNone
(
out
)
print
(
str
(
program
))
def test_prelu(self):
    """Build a channel-mode prelu layer inside a fresh Program and print it."""
    prog = Program()
    with program_guard(prog):
        data_var = layers.data(
            name="input", shape=[5, 200, 100, 100], dtype="float32")
        # Alpha is initialised to the constant 1.0 through an explicit
        # parameter attribute.
        result = layers.prelu(
            data_var,
            'channel',
            param_attr=ParamAttr(initializer=Constant(1.0)),
            name='prelu')
        self.assertIsNotNone(result)
    print(str(prog))
# Run every test case in this module when the file is executed as a script.
if '__main__' == __name__:
    unittest.main()
python/paddle/fluid/tests/unittests/test_prelu_op.py
浏览文件 @
32ca9fe7
...
...
@@ -20,30 +20,58 @@ from op_test import OpTest
class PReluTest(OpTest):
    """Output and gradient checks for the ``prelu`` operator.

    Subclasses choose the weight-sharing mode by overriding ``initTestCase``;
    the base class uses 'channel'.
    """

    def setUp(self):
        """Create random X / Alpha inputs and the NumPy reference output."""
        self.op_type = "prelu"
        self.initTestCase()

        x_np = np.random.normal(size=(3, 5, 5, 10)).astype("float32")
        # Since zero point in prelu is not differentiable, avoid randomize
        # zero: anything with magnitude below 0.005 is replaced by 0.02.
        x_np[np.abs(x_np) < 0.005] = 0.02

        # Alpha's shape follows the weight-sharing mode.
        mode = self.attrs['mode']
        if mode == "all":
            alpha_np = np.random.rand(1).astype("float32")
        elif mode == "channel":
            alpha_np = np.random.rand(1, x_np.shape[1], 1, 1).astype("float32")
        else:
            alpha_np = np.random.rand(*x_np.shape).astype("float32")
        self.inputs = {'X': x_np, 'Alpha': alpha_np}

        # Reference: max(0, x) + alpha * min(0, x); alpha broadcasts over X.
        out_np = np.maximum(self.inputs['X'], 0.)
        out_np = out_np + np.minimum(self.inputs['X'],
                                     0.) * self.inputs['Alpha']
        assert out_np is not self.inputs['X']
        self.outputs = {'Out': out_np}

    def initTestCase(self):
        """Hook for subclasses: set ``self.attrs`` to the mode under test."""
        self.attrs = {'mode': "channel"}

    def test_check_output(self):
        self.check_output()

    def test_check_grad(self):
        # Gradient w.r.t. X alone, then w.r.t. X and Alpha together.
        self.check_grad(['X'], 'Out')
        self.check_grad(['X', 'Alpha'], 'Out')

    def test_check_grad_ignore_x(self):
        self.check_grad(['Alpha'], 'Out', no_grad_set=set(['X']))

    def test_check_grad_ignore_alpha(self):
        # The name must be wrapped in a list: set('Alpha') would yield the
        # individual characters {'A', 'l', 'p', 'h', 'a'} instead of {'Alpha'}.
        self.check_grad(['X'], 'Out', no_grad_set=set(['Alpha']))
class TestCase1(PReluTest):
    """PReluTest variant that selects the 'all' mode."""

    def initTestCase(self):
        self.attrs = dict(mode="all")
class TestCase2(PReluTest):
    """PReluTest variant that selects the 'channel' mode."""

    def initTestCase(self):
        self.attrs = dict(mode="channel")
class TestCase3(PReluTest):
    """PReluTest variant that selects the 'element' mode."""

    def initTestCase(self):
        self.attrs = dict(mode="element")
if
__name__
==
"__main__"
:
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录