Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
216c6218
P
Paddle
项目概览
PaddlePaddle
/
Paddle
大约 1 年 前同步成功
通知
2298
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
216c6218
编写于
8月 21, 2018
作者:
Q
Qiyang Min
提交者:
GitHub
8月 21, 2018
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'release/0.15.0' into 015_for_dqn_local
上级
84f32915
ef9029db
变更
33
隐藏空白更改
内联
并排
Showing
33 changed file
with
121 addition
and
338 deletion
+121
-338
cmake/configure.cmake
cmake/configure.cmake
+4
-2
paddle/fluid/inference/api/CMakeLists.txt
paddle/fluid/inference/api/CMakeLists.txt
+3
-3
paddle/fluid/operators/activation_op.cu
paddle/fluid/operators/activation_op.cu
+1
-3
paddle/fluid/operators/activation_op.h
paddle/fluid/operators/activation_op.h
+6
-6
paddle/fluid/operators/assign_value_op.cu.cc
paddle/fluid/operators/assign_value_op.cu.cc
+1
-4
paddle/fluid/operators/conv_cudnn_op.cu.cc
paddle/fluid/operators/conv_cudnn_op.cu.cc
+20
-36
paddle/fluid/operators/cross_entropy_op.cu
paddle/fluid/operators/cross_entropy_op.cu
+4
-8
paddle/fluid/operators/elementwise_add_op.cu
paddle/fluid/operators/elementwise_add_op.cu
+1
-2
paddle/fluid/operators/elementwise_div_op.cu
paddle/fluid/operators/elementwise_div_op.cu
+2
-7
paddle/fluid/operators/elementwise_mul_op.cu
paddle/fluid/operators/elementwise_mul_op.cu
+1
-7
paddle/fluid/operators/elementwise_op_function.h
paddle/fluid/operators/elementwise_op_function.h
+2
-2
paddle/fluid/operators/elementwise_sub_op.cu
paddle/fluid/operators/elementwise_sub_op.cu
+1
-7
paddle/fluid/operators/fill_constant_op.cc
paddle/fluid/operators/fill_constant_op.cc
+34
-19
paddle/fluid/operators/fill_constant_op.cu.cc
paddle/fluid/operators/fill_constant_op.cu.cc
+0
-26
paddle/fluid/operators/fill_constant_op.h
paddle/fluid/operators/fill_constant_op.h
+0
-48
paddle/fluid/operators/fill_op.cc
paddle/fluid/operators/fill_op.cc
+1
-1
paddle/fluid/operators/gaussian_random_op.cu
paddle/fluid/operators/gaussian_random_op.cu
+0
-2
paddle/fluid/operators/math/cross_entropy.cu
paddle/fluid/operators/math/cross_entropy.cu
+2
-18
paddle/fluid/operators/math/cross_entropy.h
paddle/fluid/operators/math/cross_entropy.h
+0
-17
paddle/fluid/operators/math/selected_rows_functor.cu
paddle/fluid/operators/math/selected_rows_functor.cu
+2
-11
paddle/fluid/operators/math/softmax.cu
paddle/fluid/operators/math/softmax.cu
+0
-3
paddle/fluid/operators/mean_op.cu
paddle/fluid/operators/mean_op.cu
+4
-6
paddle/fluid/operators/mean_op.h
paddle/fluid/operators/mean_op.h
+1
-1
paddle/fluid/operators/mul_op.cu.cc
paddle/fluid/operators/mul_op.cu.cc
+3
-4
paddle/fluid/operators/pool_cudnn_op.cu.cc
paddle/fluid/operators/pool_cudnn_op.cu.cc
+2
-4
paddle/fluid/operators/scale_op.cu
paddle/fluid/operators/scale_op.cu
+1
-5
paddle/fluid/operators/softmax_cudnn_op.cu.cc
paddle/fluid/operators/softmax_cudnn_op.cu.cc
+1
-2
paddle/fluid/operators/softmax_op.cu.cc
paddle/fluid/operators/softmax_op.cu.cc
+1
-2
paddle/fluid/operators/sum_op.cu
paddle/fluid/operators/sum_op.cu
+1
-4
paddle/fluid/operators/sum_op.h
paddle/fluid/operators/sum_op.h
+1
-1
paddle/fluid/operators/top_k_op.cu
paddle/fluid/operators/top_k_op.cu
+6
-22
paddle/fluid/operators/uniform_random_op.cu
paddle/fluid/operators/uniform_random_op.cu
+9
-50
python/paddle/fluid/trainer.py
python/paddle/fluid/trainer.py
+6
-5
未找到文件。
cmake/configure.cmake
浏览文件 @
216c6218
...
...
@@ -103,10 +103,12 @@ if(WITH_GPU)
endif
()
if
(
WITH_ANAKIN
)
if
(
${
CUDA_VERSION_MAJOR
}
VERSION_LESS 8
)
message
(
FATAL_ERROR
"Anakin needs CUDA >= 8.0 to compile"
)
message
(
WARNING
"Anakin needs CUDA >= 8.0 to compile. Force WITH_ANAKIN=OFF"
)
set
(
WITH_ANAKIN OFF CACHE STRING
"Anakin is valid only when CUDA >= 8.0."
FORCE
)
endif
()
if
(
${
CUDNN_MAJOR_VERSION
}
VERSION_LESS 7
)
message
(
FATAL_ERROR
"Anakin needs CUDNN >= 7.0 to compile"
)
message
(
WARNING
"Anakin needs CUDNN >= 7.0 to compile. Force WITH_ANAKIN=OFF"
)
set
(
WITH_ANAKIN OFF CACHE STRING
"Anakin is valid only when CUDNN >= 7.0."
FORCE
)
endif
()
set
(
ENV{CUDNN_INCLUDE_DIR}
"
${
CUDNN_INCLUDE_DIR
}
/"
)
get_filename_component
(
CUDNN_LIBRARY_DIR
${
CUDNN_LIBRARY
}
DIRECTORY
)
...
...
paddle/fluid/inference/api/CMakeLists.txt
浏览文件 @
216c6218
...
...
@@ -62,13 +62,13 @@ endif()
if
(
WITH_ANAKIN AND WITH_GPU
)
# only needed in CI
# compile the libinference_anakin_api.a and anakin.so.
nv
_library
(
inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber
)
#nv_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin
)
cc
_library
(
inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber
)
cc_library
(
inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber
)
function
(
anakin_target target_name
)
target_compile_options
(
${
target_name
}
BEFORE PUBLIC
${
ANAKIN_COMPILE_EXTRA_FLAGS
}
)
endfunction
()
anakin_target
(
inference_anakin_api
)
#
anakin_target(inference_anakin_api_shared)
anakin_target
(
inference_anakin_api_shared
)
if
(
WITH_TESTING
)
cc_test
(
inference_anakin_test SRCS api_anakin_engine_tester.cc
ARGS --model=
${
ANAKIN_SOURCE_DIR
}
/mobilenet_v2.anakin.bin
...
...
paddle/fluid/operators/activation_op.cu
浏览文件 @
216c6218
...
...
@@ -26,8 +26,6 @@ namespace plat = paddle::platform;
act_type##_grad, ops::ActivationGradKernel<plat::CUDADeviceContext, \
ops::grad_functor<float>>, \
ops::ActivationGradKernel<plat::CUDADeviceContext, \
ops::grad_functor<double>>, \
ops::ActivationGradKernel<plat::CUDADeviceContext, \
ops::grad_functor<plat::float16>>);
ops::grad_functor<double>>);
FOR_EACH_KERNEL_FUNCTOR
(
REGISTER_ACTIVATION_CUDA_KERNEL
);
paddle/fluid/operators/activation_op.h
浏览文件 @
216c6218
...
...
@@ -333,7 +333,8 @@ struct SqrtGradFunctor : public BaseActivationFunctor<T> {
template
<
typename
Device
,
typename
X
,
typename
Out
,
typename
dOut
,
typename
dX
>
void
operator
()(
Device
d
,
X
x
,
Out
out
,
dOut
dout
,
dX
dx
)
const
{
dx
.
device
(
d
)
=
static_cast
<
T
>
(
0.5
)
*
dout
/
out
;
const
Out
out_conj
=
Eigen
::
numext
::
conj
(
out
);
dx
.
device
(
d
)
=
static_cast
<
T
>
(
0.5
)
*
dout
/
out_conj
;
}
};
...
...
@@ -739,7 +740,7 @@ struct PowGradFunctor : public BaseActivationFunctor<T> {
typename
dX
>
void
operator
()(
Device
d
,
X
x
,
Out
out
,
dOut
dout
,
dX
dx
)
const
{
dx
.
device
(
d
)
=
dout
*
static_cast
<
T
>
(
factor
)
*
x
.
pow
(
static_cast
<
T
>
(
factor
)
-
static_cast
<
T
>
(
1
));
x
.
pow
(
static_cast
<
T
>
(
factor
-
static_cast
<
T
>
(
1
)
));
}
};
...
...
@@ -862,11 +863,10 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
template
<
typename
Device
,
typename
X
,
typename
Out
,
typename
dOut
,
typename
dX
>
void
operator
()(
Device
d
,
X
x
,
Out
out
,
dOut
dout
,
dX
dx
)
const
{
T
b
=
static_cast
<
T
>
(
beta
);
auto
temp1
=
static_cast
<
T
>
(
1
)
/
(
static_cast
<
T
>
(
1
)
+
(
static_cast
<
T
>
(
-
b
)
*
x
).
exp
());
auto
temp2
=
temp1
*
(
static_cast
<
T
>
(
1
)
-
(
b
*
out
));
dx
.
device
(
d
)
=
dout
*
((
b
*
out
)
+
temp2
);
(
static_cast
<
T
>
(
1
)
+
(
static_cast
<
T
>
(
-
b
eta
)
*
x
).
exp
());
auto
temp2
=
temp1
*
(
static_cast
<
T
>
(
1
)
-
(
b
eta
*
out
));
dx
.
device
(
d
)
=
dout
*
((
b
eta
*
out
)
+
temp2
);
}
};
...
...
paddle/fluid/operators/assign_value_op.cu.cc
浏览文件 @
216c6218
...
...
@@ -13,10 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/assign_value_op.h"
#include "paddle/fluid/platform/float16.h"
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_CUDA_KERNEL
(
assign_value
,
ops
::
AssignValueKernel
<
int
>
,
ops
::
AssignValueKernel
<
float
>
,
ops
::
AssignValueKernel
<
plat
::
float16
>
);
ops
::
AssignValueKernel
<
float
>
);
paddle/fluid/operators/conv_cudnn_op.cu.cc
浏览文件 @
216c6218
...
...
@@ -39,27 +39,6 @@ using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
static
constexpr
size_t
kCONV_CUDNN_WORKSPACE_LIMIT_BYTES
=
static_cast
<
size_t
>
(
1024
)
*
1024
*
1024
;
template
<
typename
T
,
typename
DeviceContext
>
// bool EnableFp16(const T& dummy, const DeviceContext& dev_ctx,
bool
EnableFp16
(
const
DeviceContext
&
dev_ctx
,
cudnnConvolutionDescriptor_t
cudnn_conv_desc
)
{
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
// Tensor core is supported since the volta GPU and
// is only enabled when input and filter data are float16
if
(
dev_ctx
.
GetComputeCapability
()
>=
70
&&
std
::
type_index
(
typeid
(
T
))
==
std
::
type_index
(
typeid
(
platform
::
float16
)))
{
PADDLE_ENFORCE
(
platform
::
dynload
::
cudnnSetConvolutionMathType
(
cudnn_conv_desc
,
CUDNN_TENSOR_OP_MATH
));
return
true
;
}
else
{
PADDLE_ENFORCE
(
platform
::
dynload
::
cudnnSetConvolutionMathType
(
cudnn_conv_desc
,
CUDNN_DEFAULT_MATH
));
}
#endif
return
false
;
}
template
<
typename
T
>
class
CUDNNConvOpKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
...
...
@@ -149,14 +128,27 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
cudnnConvolutionFwdAlgo_t
algo
;
auto
&
dev_ctx
=
ctx
.
template
device_context
<
platform
::
CUDADeviceContext
>();
auto
handle
=
dev_ctx
.
cudnn_handle
();
if
(
EnableFp16
<
T
>
(
dev_ctx
,
cudnn_conv_desc
))
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionForwardAlgorithm
(
handle
,
cudnn_input_desc
,
cudnn_filter_desc
,
cudnn_conv_desc
,
cudnn_output_desc
,
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT
,
workspace_size_limit
,
&
algo
));
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
// Tensor core is supported since the volta GPU and
// is only enabled when input and filter data are float16
if
(
dev_ctx
.
GetComputeCapability
()
>=
70
&&
std
::
type_index
(
typeid
(
T
))
==
std
::
type_index
(
typeid
(
platform
::
float16
)))
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnSetConvolutionMathType
(
cudnn_conv_desc
,
CUDNN_TENSOR_OP_MATH
));
// Currently tensor core is only enabled using this algo
algo
=
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
;
}
else
{
PADDLE_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionForwardAlgorithm
(
handle
,
cudnn_input_desc
,
cudnn_filter_desc
,
cudnn_conv_desc
,
cudnn_output_desc
,
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT
,
workspace_size_limit
,
&
algo
));
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnSetConvolutionMathType
(
cudnn_conv_desc
,
CUDNN_DEFAULT_MATH
));
}
#endif
// get workspace size able to allocate
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionForwardWorkspaceSize
(
...
...
@@ -296,9 +288,6 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
}
else
{
data_algo
=
CUDNN_CONVOLUTION_BWD_DATA_ALGO_1
;
}
if
(
EnableFp16
<
T
>
(
dev_ctx
,
cudnn_conv_desc
))
{
data_algo
=
CUDNN_CONVOLUTION_BWD_DATA_ALGO_1
;
}
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionBackwardDataWorkspaceSize
(
...
...
@@ -318,9 +307,6 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
}
else
{
filter_algo
=
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1
;
}
if
(
EnableFp16
<
T
>
(
dev_ctx
,
cudnn_conv_desc
))
{
filter_algo
=
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1
;
}
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionBackwardFilterWorkspaceSize
(
...
...
@@ -376,8 +362,7 @@ REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace,
paddle
::
operators
::
CUDNNConvOpKernel
<
plat
::
float16
>
);
REGISTER_OP_KERNEL
(
conv2d_grad
,
CUDNN
,
plat
::
CUDAPlace
,
paddle
::
operators
::
CUDNNConvGradOpKernel
<
float
>
,
paddle
::
operators
::
CUDNNConvGradOpKernel
<
double
>
,
paddle
::
operators
::
CUDNNConvGradOpKernel
<
plat
::
float16
>
);
paddle
::
operators
::
CUDNNConvGradOpKernel
<
double
>
);
REGISTER_OP_KERNEL
(
conv3d
,
CUDNN
,
plat
::
CUDAPlace
,
paddle
::
operators
::
CUDNNConvOpKernel
<
float
>
,
...
...
@@ -385,5 +370,4 @@ REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace,
paddle
::
operators
::
CUDNNConvOpKernel
<
plat
::
float16
>
);
REGISTER_OP_KERNEL
(
conv3d_grad
,
CUDNN
,
plat
::
CUDAPlace
,
paddle
::
operators
::
CUDNNConvGradOpKernel
<
float
>
,
paddle
::
operators
::
CUDNNConvGradOpKernel
<
double
>
,
paddle
::
operators
::
CUDNNConvGradOpKernel
<
plat
::
float16
>
)
paddle
::
operators
::
CUDNNConvGradOpKernel
<
double
>
);
paddle/fluid/operators/cross_entropy_op.cu
浏览文件 @
216c6218
...
...
@@ -13,16 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/cross_entropy_op.h"
#include "paddle/fluid/platform/float16.h"
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
using
CUDACtx
=
paddle
::
platform
::
CUDADeviceContext
;
REGISTER_OP_CUDA_KERNEL
(
cross_entropy
,
ops
::
CrossEntropyOpKernel
<
CUDACtx
,
float
>
,
ops
::
CrossEntropyOpKernel
<
CUDACtx
,
double
>
,
ops
::
CrossEntropyOpKernel
<
CUDACtx
,
plat
::
float16
>
);
REGISTER_OP_CUDA_KERNEL
(
cross_entropy_grad
,
ops
::
CrossEntropyGradientOpKernel
<
CUDACtx
,
float
>
,
ops
::
CrossEntropyGradientOpKernel
<
CUDACtx
,
double
>
,
ops
::
CrossEntropyGradientOpKernel
<
CUDACtx
,
plat
::
float16
>
);
ops
::
CrossEntropyOpKernel
<
CUDACtx
,
double
>
);
REGISTER_OP_CUDA_KERNEL
(
cross_entropy_grad
,
ops
::
CrossEntropyGradientOpKernel
<
CUDACtx
,
float
>
,
ops
::
CrossEntropyGradientOpKernel
<
CUDACtx
,
double
>
);
paddle/fluid/operators/elementwise_add_op.cu
浏览文件 @
216c6218
...
...
@@ -30,5 +30,4 @@ REGISTER_OP_CUDA_KERNEL(
ops
::
ElementwiseAddGradKernel
<
plat
::
CUDADeviceContext
,
float
>
,
ops
::
ElementwiseAddGradKernel
<
plat
::
CUDADeviceContext
,
double
>
,
ops
::
ElementwiseAddGradKernel
<
plat
::
CUDADeviceContext
,
int
>
,
ops
::
ElementwiseAddGradKernel
<
plat
::
CUDADeviceContext
,
int64_t
>
,
ops
::
ElementwiseAddGradKernel
<
plat
::
CUDADeviceContext
,
plat
::
float16
>
);
ops
::
ElementwiseAddGradKernel
<
plat
::
CUDADeviceContext
,
int64_t
>
);
paddle/fluid/operators/elementwise_div_op.cu
浏览文件 @
216c6218
...
...
@@ -14,24 +14,19 @@ limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/fluid/operators/elementwise_div_op.h"
#include "paddle/fluid/platform/float16.h"
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_CUDA_KERNEL
(
elementwise_div
,
ops
::
ElementwiseDivKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
ElementwiseDivKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
ops
::
ElementwiseDivKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int
>
,
ops
::
ElementwiseDivKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
,
ops
::
ElementwiseDivKernel
<
paddle
::
platform
::
CUDADeviceContext
,
plat
::
float16
>
);
ops
::
ElementwiseDivKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
);
REGISTER_OP_CUDA_KERNEL
(
elementwise_div_grad
,
ops
::
ElementwiseDivGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
ElementwiseDivGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
ops
::
ElementwiseDivGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int
>
,
ops
::
ElementwiseDivGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
,
ops
::
ElementwiseDivGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
plat
::
float16
>
);
int64_t
>
);
paddle/fluid/operators/elementwise_mul_op.cu
浏览文件 @
216c6218
...
...
@@ -14,25 +14,19 @@ limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/fluid/operators/elementwise_mul_op.h"
#include "paddle/fluid/platform/float16.h"
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_CUDA_KERNEL
(
elementwise_mul
,
ops
::
ElementwiseMulKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
ElementwiseMulKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
ops
::
ElementwiseMulKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int
>
,
ops
::
ElementwiseMulKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
,
ops
::
ElementwiseMulKernel
<
paddle
::
platform
::
CUDADeviceContext
,
plat
::
float16
>
);
ops
::
ElementwiseMulKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
);
REGISTER_OP_CUDA_KERNEL
(
elementwise_mul_grad
,
ops
::
ElementwiseMulGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
ElementwiseMulGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
ops
::
ElementwiseMulGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int
>
,
ops
::
ElementwiseMulGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
plat
::
float16
>
,
ops
::
ElementwiseMulGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
);
paddle/fluid/operators/elementwise_op_function.h
浏览文件 @
216c6218
...
...
@@ -350,7 +350,7 @@ static __global__ void ElemwiseGradBroadcast1CUDAKernel(
int
j
=
blockIdx
.
x
;
int
i
=
threadIdx
.
x
;
int
tid
=
threadIdx
.
x
;
T
val
(
0
)
;
T
val
=
0
;
do
{
int
x_offset
=
i
*
w
+
j
;
...
...
@@ -418,7 +418,7 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel(
int
tid
=
threadIdx
.
x
;
int
j
=
blockIdx
.
x
;
T
val
(
0
)
;
T
val
=
0
;
int
ttid
=
tid
;
while
(
true
)
{
...
...
paddle/fluid/operators/elementwise_sub_op.cu
浏览文件 @
216c6218
...
...
@@ -14,25 +14,19 @@ limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/fluid/operators/elementwise_sub_op.h"
#include "paddle/fluid/platform/float16.h"
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_CUDA_KERNEL
(
elementwise_sub
,
ops
::
ElementwiseSubKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
ElementwiseSubKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
ops
::
ElementwiseSubKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int
>
,
ops
::
ElementwiseSubKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
,
ops
::
ElementwiseSubKernel
<
paddle
::
platform
::
CUDADeviceContext
,
plat
::
float16
>
);
ops
::
ElementwiseSubKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
);
REGISTER_OP_CUDA_KERNEL
(
elementwise_sub_grad
,
ops
::
ElementwiseSubGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
ElementwiseSubGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
ops
::
ElementwiseSubGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int
>
,
ops
::
ElementwiseSubGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
plat
::
float16
>
,
ops
::
ElementwiseSubGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
);
paddle/fluid/operators/fill_constant_op.cc
浏览文件 @
216c6218
...
...
@@ -12,28 +12,48 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/fill_constant_op.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/device_context.h"
namespace
paddle
{
namespace
operators
{
class
FillConstant
Op
:
public
framework
::
OperatorWithKernel
{
class
FillConstant
InferShape
:
public
framework
::
InferShapeBase
{
public:
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
void
operator
()(
framework
::
InferShapeContext
*
ctx
)
const
override
{
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Out"
),
"Output(Out) of FillConstantOp should not be null."
);
auto
&
shape
=
ctx
->
Attrs
().
Get
<
std
::
vector
<
int
>>
(
"shape"
);
auto
&
shape
=
ctx
->
Attrs
().
Get
<
std
::
vector
<
int
>>
(
"shape"
);
ctx
->
SetOutputDim
(
"Out"
,
framework
::
make_ddim
(
shape
));
}
};
class
FillConstantOp
:
public
framework
::
OperatorBase
{
public:
using
framework
::
OperatorBase
::
OperatorBase
;
private:
void
RunImpl
(
const
framework
::
Scope
&
scope
,
const
platform
::
Place
&
dev_place
)
const
override
{
auto
data_type
=
static_cast
<
framework
::
proto
::
VarType
::
Type
>
(
Attr
<
int
>
(
"dtype"
));
auto
value
=
Attr
<
float
>
(
"value"
);
auto
force_cpu
=
Attr
<
bool
>
(
"force_cpu"
);
auto
&
out
=
*
scope
.
FindVar
(
Output
(
"Out"
))
->
GetMutable
<
framework
::
LoDTensor
>
();
out
.
Resize
(
framework
::
make_ddim
(
Attr
<
std
::
vector
<
int
>>
(
"shape"
)));
if
(
force_cpu
)
{
auto
cpu
=
platform
::
CPUPlace
();
out
.
mutable_data
(
cpu
,
framework
::
ToTypeIndex
(
data_type
));
}
else
{
out
.
mutable_data
(
dev_place
,
framework
::
ToTypeIndex
(
data_type
));
}
framework
::
OpKernelType
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
return
framework
::
OpKernelType
(
static_cast
<
framework
::
proto
::
VarType
::
Type
>
(
ctx
.
Attr
<
int
>
(
"dtype"
)),
ctx
.
device_context
());
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
&
dev_ctx
=
*
pool
.
Get
(
dev_place
);
math
::
set_constant
(
dev_ctx
,
&
out
,
value
);
}
};
...
...
@@ -67,11 +87,6 @@ Fill up a variable with specified constant value.
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
REGISTER_OPERATOR
(
fill_constant
,
ops
::
FillConstantOp
,
ops
::
FillConstantOpMaker
,
REGISTER_OPERATOR
(
fill_constant
,
ops
::
FillConstantOp
,
ops
::
FillConstantInferShape
,
ops
::
FillConstantOpMaker
,
paddle
::
framework
::
EmptyGradOpMaker
);
REGISTER_OP_CPU_KERNEL
(
fill_constant
,
ops
::
FillConstantOpKernel
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
,
ops
::
FillConstantOpKernel
<
paddle
::
platform
::
CPUDeviceContext
,
double
>
,
ops
::
FillConstantOpKernel
<
paddle
::
platform
::
CPUDeviceContext
,
int
>
,
ops
::
FillConstantOpKernel
<
paddle
::
platform
::
CPUDeviceContext
,
int64_t
>
)
paddle/fluid/operators/fill_constant_op.cu.cc
已删除
100644 → 0
浏览文件 @
84f32915
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/fill_constant_op.h"
#include "paddle/fluid/platform/float16.h"
namespace
ops
=
paddle
::
operators
;
REGISTER_OP_CUDA_KERNEL
(
fill_constant
,
ops
::
FillConstantOpKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
FillConstantOpKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
ops
::
FillConstantOpKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int
>
,
ops
::
FillConstantOpKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
,
ops
::
FillConstantOpKernel
<
paddle
::
platform
::
CUDADeviceContext
,
paddle
::
platform
::
float16
>
)
paddle/fluid/operators/fill_constant_op.h
已删除
100644 → 0
浏览文件 @
84f32915
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace
paddle
{
namespace
operators
{
template
<
typename
DeviceContext
,
typename
T
>
class
FillConstantOpKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
data_type
=
static_cast
<
framework
::
proto
::
VarType
::
Type
>
(
ctx
.
Attr
<
int
>
(
"dtype"
));
auto
value
=
ctx
.
Attr
<
float
>
(
"value"
);
auto
force_cpu
=
ctx
.
Attr
<
bool
>
(
"force_cpu"
);
auto
*
out
=
ctx
.
Output
<
framework
::
Tensor
>
(
"Out"
);
out
->
Resize
(
framework
::
make_ddim
(
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"shape"
)));
if
(
force_cpu
)
{
auto
cpu
=
platform
::
CPUPlace
();
out
->
mutable_data
(
cpu
,
framework
::
ToTypeIndex
(
data_type
));
}
else
{
out
->
mutable_data
(
ctx
.
GetPlace
(),
framework
::
ToTypeIndex
(
data_type
));
}
math
::
set_constant
(
ctx
.
template
device_context
<
DeviceContext
>(),
out
,
value
);
}
};
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/fill_op.cc
浏览文件 @
216c6218
...
...
@@ -16,7 +16,6 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/float16.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -70,6 +69,7 @@ class FillOp : public framework::OperatorBase {
framework
::
VisitDataType
(
dtype
,
FillOpVisitor
(
&
tensor
,
Attr
<
std
::
vector
<
float
>>
(
"value"
)));
if
(
!
force_cpu
&&
platform
::
is_gpu_place
(
place
))
{
// Copy tensor to out
platform
::
DeviceContextPool
&
pool
=
...
...
paddle/fluid/operators/gaussian_random_op.cu
浏览文件 @
216c6218
...
...
@@ -15,7 +15,6 @@ limitations under the License. */
#include <thrust/transform.h>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/float16.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -61,7 +60,6 @@ class GPUGaussianRandomKernel : public framework::OpKernel<T> {
}
// namespace operators
}
// namespace paddle
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_CUDA_KERNEL
(
gaussian_random
,
paddle
::
operators
::
GPUGaussianRandomKernel
<
float
>
,
paddle
::
operators
::
GPUGaussianRandomKernel
<
double
>
);
...
...
paddle/fluid/operators/math/cross_entropy.cu
浏览文件 @
216c6218
...
...
@@ -15,25 +15,11 @@ limitations under the License. */
#include "paddle/fluid/operators/math/cross_entropy.h"
#include "paddle/fluid/platform/cuda_device_function.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/float16.h"
namespace
paddle
{
namespace
operators
{
namespace
math
{
template
<
typename
T
>
HOSTDEVICE
T
log
(
const
T
&
val
)
{
return
std
::
log
(
val
);
}
template
<
>
HOSTDEVICE
platform
::
float16
log
(
const
platform
::
float16
&
val
)
{
// strage bug, hlog is not exists.
return
static_cast
<
float16
>
(
0
);
// half tmp = static_cast<half>(val);
// return static_cast<platform::float16>(hlog(tmp));
}
namespace
{
template
<
typename
T
>
__global__
void
CrossEntropyKernel
(
T
*
Y
,
const
T
*
X
,
const
int64_t
*
label
,
...
...
@@ -49,12 +35,12 @@ template <typename T>
__global__
void
SoftCrossEntropyKernel
(
T
*
Y
,
const
T
*
X
,
const
T
*
label
,
const
int
class_num
)
{
int
tid
=
threadIdx
.
x
;
T
val
(
0
)
;
T
val
=
0
;
int
idx
=
blockIdx
.
x
*
class_num
+
tid
;
int
end
=
blockIdx
.
x
*
class_num
+
class_num
;
for
(;
idx
<
end
;
idx
+=
blockDim
.
x
)
{
val
+=
math
::
TolerableValue
<
T
>
()(
log
(
X
[
idx
]))
*
label
[
idx
];
val
+=
math
::
TolerableValue
<
T
>
()(
std
::
log
(
X
[
idx
]))
*
label
[
idx
];
}
val
=
paddle
::
platform
::
reduceSum
(
val
,
tid
,
blockDim
.
x
);
...
...
@@ -98,8 +84,6 @@ class CrossEntropyFunctor<platform::CUDADeviceContext, T> {
template
class
CrossEntropyFunctor
<
platform
::
CUDADeviceContext
,
float
>;
template
class
CrossEntropyFunctor
<
platform
::
CUDADeviceContext
,
double
>;
template
class
CrossEntropyFunctor
<
platform
::
CUDADeviceContext
,
platform
::
float16
>;
}
// namespace math
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/math/cross_entropy.h
浏览文件 @
216c6218
...
...
@@ -13,10 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <limits>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/hostdevice.h"
namespace
paddle
{
...
...
@@ -35,21 +33,6 @@ struct TolerableValue {
}
};
// float16 value clip behave different.
using
paddle
::
platform
::
float16
;
using
paddle
::
platform
::
isfinite
;
template
<
>
struct
TolerableValue
<
float16
>
{
HOSTDEVICE
float16
operator
()(
const
float16
&
x
)
const
{
if
(
isfinite
(
x
))
return
x
;
else
if
(
x
>
static_cast
<
float16
>
(
0
))
return
std
::
numeric_limits
<
float16
>::
max
();
else
return
std
::
numeric_limits
<
float16
>::
min
();
}
};
template
<
typename
DeviceContext
,
typename
T
>
class
CrossEntropyFunctor
{
public:
...
...
paddle/fluid/operators/math/selected_rows_functor.cu
浏览文件 @
216c6218
...
...
@@ -18,7 +18,6 @@ limitations under the License. */
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/float16.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -77,7 +76,6 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> {
template
struct
SelectedRowsAdd
<
platform
::
CUDADeviceContext
,
float
>;
template
struct
SelectedRowsAdd
<
platform
::
CUDADeviceContext
,
double
>;
template
struct
SelectedRowsAdd
<
platform
::
CUDADeviceContext
,
platform
::
float16
>;
namespace
{
template
<
typename
T
,
int
block_size
>
...
...
@@ -122,7 +120,7 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
auto
*
out_data
=
output
->
data
<
T
>
();
SetConstant
<
platform
::
CUDADeviceContext
,
T
>
functor
;
functor
(
context
,
output
,
static_cast
<
T
>
(
0
)
);
functor
(
context
,
output
,
0.0
);
const
int
block_size
=
256
;
dim3
threads
(
block_size
,
1
);
...
...
@@ -140,8 +138,6 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
template
struct
SelectedRowsAddTensor
<
platform
::
CUDADeviceContext
,
float
>;
template
struct
SelectedRowsAddTensor
<
platform
::
CUDADeviceContext
,
double
>;
template
struct
SelectedRowsAddTensor
<
platform
::
CUDADeviceContext
,
platform
::
float16
>;
template
<
typename
T
>
struct
SelectedRowsAddTo
<
platform
::
CUDADeviceContext
,
T
>
{
...
...
@@ -181,8 +177,6 @@ template struct SelectedRowsAddTo<platform::CUDADeviceContext, float>;
template
struct
SelectedRowsAddTo
<
platform
::
CUDADeviceContext
,
double
>;
template
struct
SelectedRowsAddTo
<
platform
::
CUDADeviceContext
,
int
>;
template
struct
SelectedRowsAddTo
<
platform
::
CUDADeviceContext
,
int64_t
>;
template
struct
SelectedRowsAddTo
<
platform
::
CUDADeviceContext
,
platform
::
float16
>;
namespace
{
template
<
typename
T
,
int
block_size
>
...
...
@@ -235,8 +229,6 @@ template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, float>;
template
struct
SelectedRowsAddToTensor
<
platform
::
CUDADeviceContext
,
double
>;
template
struct
SelectedRowsAddToTensor
<
platform
::
CUDADeviceContext
,
int
>;
template
struct
SelectedRowsAddToTensor
<
platform
::
CUDADeviceContext
,
int64_t
>;
template
struct
SelectedRowsAddToTensor
<
platform
::
CUDADeviceContext
,
platform
::
float16
>;
namespace
scatter
{
...
...
@@ -284,7 +276,7 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
context
.
GetPlace
());
math
::
SetConstant
<
platform
::
CUDADeviceContext
,
T
>
constant_functor
;
constant_functor
(
context
,
out
.
mutable_value
(),
static_cast
<
T
>
(
0
)
);
constant_functor
(
context
,
out
.
mutable_value
(),
0.0
);
auto
*
out_data
=
out
.
mutable_value
()
->
data
<
T
>
();
auto
*
input_data
=
input
.
value
().
data
<
T
>
();
...
...
@@ -308,7 +300,6 @@ template struct MergeAdd<platform::CUDADeviceContext, float>;
template
struct
MergeAdd
<
platform
::
CUDADeviceContext
,
double
>;
template
struct
MergeAdd
<
platform
::
CUDADeviceContext
,
int
>;
template
struct
MergeAdd
<
platform
::
CUDADeviceContext
,
int64_t
>;
template
struct
MergeAdd
<
platform
::
CUDADeviceContext
,
platform
::
float16
>;
template
<
typename
T
,
int
block_size
>
__global__
void
UpdateToTensorKernel
(
const
T
*
selected_rows
,
...
...
paddle/fluid/operators/math/softmax.cu
浏览文件 @
216c6218
...
...
@@ -94,15 +94,12 @@ void SoftmaxGradCUDNNFunctor<T>::operator()(
template
class
SoftmaxCUDNNFunctor
<
platform
::
float16
>;
template
class
SoftmaxCUDNNFunctor
<
float
>;
template
class
SoftmaxCUDNNFunctor
<
double
>;
template
class
SoftmaxGradCUDNNFunctor
<
platform
::
float16
>;
template
class
SoftmaxGradCUDNNFunctor
<
float
>;
template
class
SoftmaxGradCUDNNFunctor
<
double
>;
template
class
SoftmaxFunctor
<
platform
::
CUDADeviceContext
,
platform
::
float16
>;
template
class
SoftmaxFunctor
<
platform
::
CUDADeviceContext
,
float
>;
template
class
SoftmaxFunctor
<
platform
::
CUDADeviceContext
,
double
>;
template
class
SoftmaxGradFunctor
<
platform
::
CUDADeviceContext
,
platform
::
float16
>;
template
class
SoftmaxGradFunctor
<
platform
::
CUDADeviceContext
,
float
>;
template
class
SoftmaxGradFunctor
<
platform
::
CUDADeviceContext
,
double
>;
...
...
paddle/fluid/operators/mean_op.cu
浏览文件 @
216c6218
...
...
@@ -12,16 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/fluid/operators/mean_op.h"
#include "paddle/fluid/platform/float16.h"
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_CUDA_KERNEL
(
mean
,
ops
::
MeanKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
MeanKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
ops
::
MeanKernel
<
paddle
::
platform
::
CUDADeviceContext
,
plat
::
float16
>
);
ops
::
MeanKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
);
REGISTER_OP_CUDA_KERNEL
(
mean_grad
,
ops
::
MeanGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
MeanGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
ops
::
MeanGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
plat
::
float16
>
);
ops
::
MeanGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
);
paddle/fluid/operators/mean_op.h
浏览文件 @
216c6218
...
...
@@ -55,7 +55,7 @@ class MeanGradKernel : public framework::OpKernel<T> {
IG
->
mutable_data
<
T
>
(
context
.
GetPlace
());
T
ig_size
=
static_cast
<
T
>
(
IG
->
numel
());
Eigen
::
DSizes
<
int
,
1
>
bcast
(
static_cast
<
int
>
(
ig_size
)
);
Eigen
::
DSizes
<
int
,
1
>
bcast
(
ig_size
);
EigenVector
<
T
>::
Flatten
(
*
IG
).
device
(
*
context
.
template
device_context
<
DeviceContext
>().
eigen_device
())
=
...
...
paddle/fluid/operators/mul_op.cu.cc
浏览文件 @
216c6218
...
...
@@ -20,7 +20,6 @@ namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL
(
mul
,
ops
::
MulKernel
<
plat
::
CUDADeviceContext
,
float
>
,
ops
::
MulKernel
<
plat
::
CUDADeviceContext
,
double
>
,
ops
::
MulKernel
<
plat
::
CUDADeviceContext
,
plat
::
float16
>
);
REGISTER_OP_CUDA_KERNEL
(
mul_grad
,
ops
::
MulGradKernel
<
plat
::
CUDADeviceContext
,
float
>
,
ops
::
MulGradKernel
<
plat
::
CUDADeviceContext
,
double
>
,
ops
::
MulGradKernel
<
plat
::
CUDADeviceContext
,
plat
::
float16
>
);
REGISTER_OP_CUDA_KERNEL
(
mul_grad
,
ops
::
MulGradKernel
<
plat
::
CUDADeviceContext
,
float
>
,
ops
::
MulGradKernel
<
plat
::
CUDADeviceContext
,
double
>
);
paddle/fluid/operators/pool_cudnn_op.cu.cc
浏览文件 @
216c6218
...
...
@@ -174,8 +174,7 @@ REGISTER_OP_KERNEL(pool2d, CUDNN, plat::CUDAPlace,
ops
::
PoolCUDNNOpKernel
<
plat
::
float16
>
);
REGISTER_OP_KERNEL
(
pool2d_grad
,
CUDNN
,
plat
::
CUDAPlace
,
ops
::
PoolCUDNNGradOpKernel
<
float
>
,
ops
::
PoolCUDNNGradOpKernel
<
double
>
,
ops
::
PoolCUDNNGradOpKernel
<
plat
::
float16
>
);
ops
::
PoolCUDNNGradOpKernel
<
double
>
);
REGISTER_OP_KERNEL
(
pool3d
,
CUDNN
,
plat
::
CUDAPlace
,
ops
::
PoolCUDNNOpKernel
<
float
>
,
...
...
@@ -183,5 +182,4 @@ REGISTER_OP_KERNEL(pool3d, CUDNN, plat::CUDAPlace,
ops
::
PoolCUDNNOpKernel
<
plat
::
float16
>
);
REGISTER_OP_KERNEL
(
pool3d_grad
,
CUDNN
,
plat
::
CUDAPlace
,
ops
::
PoolCUDNNGradOpKernel
<
float
>
,
ops
::
PoolCUDNNGradOpKernel
<
double
>
,
ops
::
PoolCUDNNGradOpKernel
<
plat
::
float16
>
);
ops
::
PoolCUDNNGradOpKernel
<
double
>
);
paddle/fluid/operators/scale_op.cu
浏览文件 @
216c6218
...
...
@@ -13,15 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/scale_op.h"
#include "paddle/fluid/platform/float16.h"
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_CUDA_KERNEL
(
scale
,
paddle
::
operators
::
ScaleKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
paddle
::
operators
::
ScaleKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
paddle
::
operators
::
ScaleKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int
>
,
paddle
::
operators
::
ScaleKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
,
paddle
::
operators
::
ScaleKernel
<
paddle
::
platform
::
CUDADeviceContext
,
plat
::
float16
>
);
int64_t
>
);
paddle/fluid/operators/softmax_cudnn_op.cu.cc
浏览文件 @
216c6218
...
...
@@ -78,5 +78,4 @@ REGISTER_OP_KERNEL(softmax, CUDNN, plat::CUDAPlace,
ops
::
SoftmaxCUDNNKernel
<
float
>
,
ops
::
SoftmaxCUDNNKernel
<
plat
::
float16
>
);
REGISTER_OP_KERNEL
(
softmax_grad
,
CUDNN
,
plat
::
CUDAPlace
,
ops
::
SoftmaxGradCUDNNKernel
<
float
>
,
ops
::
SoftmaxGradCUDNNKernel
<
plat
::
float16
>
);
ops
::
SoftmaxGradCUDNNKernel
<
float
>
);
paddle/fluid/operators/softmax_op.cu.cc
浏览文件 @
216c6218
...
...
@@ -23,5 +23,4 @@ REGISTER_OP_CUDA_KERNEL(
ops
::
SoftmaxKernel
<
plat
::
CUDADeviceContext
,
plat
::
float16
>
);
REGISTER_OP_CUDA_KERNEL
(
softmax_grad
,
ops
::
SoftmaxGradKernel
<
plat
::
CUDADeviceContext
,
float
>
,
ops
::
SoftmaxGradKernel
<
plat
::
CUDADeviceContext
,
double
>
,
ops
::
SoftmaxGradKernel
<
plat
::
CUDADeviceContext
,
plat
::
float16
>
);
ops
::
SoftmaxGradKernel
<
plat
::
CUDADeviceContext
,
double
>
);
paddle/fluid/operators/sum_op.cu
浏览文件 @
216c6218
...
...
@@ -11,13 +11,10 @@ limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/fluid/operators/sum_op.h"
#include "paddle/fluid/platform/float16.h"
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_CUDA_KERNEL
(
sum
,
ops
::
SumKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
SumKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
ops
::
SumKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int
>
,
ops
::
SumKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
,
ops
::
SumKernel
<
paddle
::
platform
::
CUDADeviceContext
,
plat
::
float16
>
);
ops
::
SumKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
);
paddle/fluid/operators/sum_op.h
浏览文件 @
216c6218
...
...
@@ -46,7 +46,7 @@ class SumKernel : public framework::OpKernel<T> {
if
(
!
in_place
)
{
math
::
SetConstant
<
DeviceContext
,
T
>
constant_functor
;
constant_functor
(
context
.
template
device_context
<
DeviceContext
>(),
out
,
static_cast
<
T
>
(
0
)
);
0.0
);
}
math
::
SelectedRowsAddToTensor
<
DeviceContext
,
T
>
functor
;
...
...
paddle/fluid/operators/top_k_op.cu
浏览文件 @
216c6218
...
...
@@ -11,19 +11,16 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <limits>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/top_k_op.h"
#include "paddle/fluid/platform/assert.h"
#include "paddle/fluid/platform/cuda_device_function.h"
#include "paddle/fluid/platform/float16.h"
namespace
paddle
{
namespace
operators
{
using
Tensor
=
framework
::
Tensor
;
using
paddle
::
platform
::
float16
;
template
<
typename
T
>
struct
Pair
{
...
...
@@ -35,11 +32,6 @@ struct Pair {
id
=
id
;
}
__device__
__forceinline__
void
clear
()
{
v
=
-
INFINITY
;
id
=
-
1
;
}
__device__
__forceinline__
void
operator
=
(
const
Pair
<
T
>&
in
)
{
v
=
in
.
v
;
id
=
in
.
id
;
...
...
@@ -61,12 +53,6 @@ struct Pair {
int64_t
id
;
};
template
<
>
__device__
__forceinline__
void
Pair
<
float16
>::
clear
()
{
v
=
platform
::
raw_uint16_to_float16
(
0x400
);
id
=
-
1
;
}
template
<
typename
T
>
__device__
__forceinline__
void
AddTo
(
Pair
<
T
>
topk
[],
const
Pair
<
T
>&
p
,
int
beam_size
)
{
...
...
@@ -164,7 +150,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
if
(
k
<
MaxLength
-
(
*
beam
))
{
topk
[
k
]
=
topk
[
k
+
*
beam
];
}
else
{
topk
[
k
].
clear
(
);
topk
[
k
].
set
(
-
INFINITY
,
-
1
);
}
}
if
(
!
(
*
is_empty
))
{
...
...
@@ -174,7 +160,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
}
*
max
=
topk
[
MaxLength
-
1
];
if
((
*
max
).
v
==
static_cast
<
T
>
(
-
1
)
)
*
is_empty
=
true
;
if
((
*
max
).
v
==
-
1
)
*
is_empty
=
true
;
*
beam
=
0
;
}
}
...
...
@@ -195,7 +181,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
if
(
k
<
MaxLength
-
*
beam
)
{
topk
[
k
]
=
topk
[
k
+
*
beam
];
}
else
{
topk
[
k
].
set
(
std
::
numeric_limits
<
T
>::
min
()
,
-
1
);
topk
[
k
].
set
(
-
INFINITY
,
-
1
);
}
}
if
(
!
(
*
is_empty
))
{
...
...
@@ -287,7 +273,7 @@ __global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices,
bool
firststep
=
true
;
for
(
int
k
=
0
;
k
<
MaxLength
;
k
++
)
{
topk
[
k
].
clear
(
);
topk
[
k
].
set
(
-
INFINITY
,
-
1
);
}
while
(
k
)
{
ThreadGetTopK
<
T
,
MaxLength
,
BlockSize
>
(
topk
,
&
beam
,
k
,
...
...
@@ -339,7 +325,5 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
}
// namespace operators
}
// namespace paddle
REGISTER_OP_CUDA_KERNEL
(
top_k
,
paddle
::
operators
::
TopkOpCUDAKernel
<
float
>
,
paddle
::
operators
::
TopkOpCUDAKernel
<
double
>
,
paddle
::
operators
::
TopkOpCUDAKernel
<
paddle
::
platform
::
float16
>
);
REGISTER_OP_CUDA_KERNEL
(
top_k
,
paddle
::
operators
::
TopkOpCUDAKernel
<
float
>
,
paddle
::
operators
::
TopkOpCUDAKernel
<
double
>
);
paddle/fluid/operators/uniform_random_op.cu
浏览文件 @
216c6218
...
...
@@ -11,14 +11,10 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <glog/logging.h>
#include <thrust/random.h>
#include <thrust/transform.h>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/transform.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -40,11 +36,6 @@ struct UniformGenerator {
}
};
template
<
typename
T
,
typename
V
>
struct
CastFunctor
{
HOSTDEVICE
V
operator
()(
const
T
&
a
)
{
return
static_cast
<
V
>
(
a
);
}
};
// It seems that Eigen::Tensor::random in GPU will SEGFAULT.
// Use std::random and thrust::random(thrust is a std library in CUDA) to
// implement uniform random.
...
...
@@ -75,50 +66,18 @@ class GPUUniformRandomKernel : public framework::OpKernel<T> {
T
max
=
static_cast
<
T
>
(
context
.
Attr
<
float
>
(
"max"
));
thrust
::
counting_iterator
<
unsigned
int
>
index_sequence_begin
(
0
);
int64_t
size
=
tensor
->
numel
();
if
(
out_var
->
IsType
<
framework
::
LoDTensor
>
()
&&
std
::
type_index
(
typeid
(
T
))
==
std
::
type_index
(
typeid
(
platform
::
float16
)))
{
framework
::
Tensor
master_copy_tensor
;
master_copy_tensor
.
Resize
(
tensor
->
dims
());
float
*
master_copy_tensor_data
=
master_copy_tensor
.
mutable_data
<
float
>
(
context
.
GetPlace
());
thrust
::
transform
(
index_sequence_begin
,
index_sequence_begin
+
size
,
thrust
::
device_ptr
<
float
>
(
master_copy_tensor_data
),
UniformGenerator
<
float
>
(
static_cast
<
float
>
(
min
),
static_cast
<
float
>
(
max
),
seed
));
platform
::
Transform
<
platform
::
CUDADeviceContext
>
trans
;
auto
*
in_begin
=
master_copy_tensor
.
data
<
float
>
();
auto
*
in_end
=
in_begin
+
master_copy_tensor
.
numel
();
auto
*
out_begin
=
tensor
->
mutable_data
<
T
>
(
context
.
GetPlace
());
trans
(
context
.
template
device_context
<
platform
::
CUDADeviceContext
>(),
in_begin
,
in_end
,
out_begin
,
CastFunctor
<
float
,
T
>
());
}
else
{
thrust
::
transform
(
index_sequence_begin
,
index_sequence_begin
+
size
,
thrust
::
device_ptr
<
T
>
(
data
),
UniformGenerator
<
T
>
(
min
,
max
,
seed
));
}
if
(
VLOG_IS_ON
(
5
))
{
framework
::
Tensor
cpu_tensor
;
framework
::
TensorCopySync
(
*
tensor
,
platform
::
CPUPlace
(),
&
cpu_tensor
);
auto
&
dev_ctx
=
*
platform
::
DeviceContextPool
::
Instance
().
Get
(
context
.
GetPlace
());
dev_ctx
.
Wait
();
auto
x
=
framework
::
EigenVector
<
T
>::
Flatten
(
cpu_tensor
);
VLOG
(
5
)
<<
"The Uniform output "
<<
x
;
}
thrust
::
transform
(
index_sequence_begin
,
index_sequence_begin
+
size
,
thrust
::
device_ptr
<
T
>
(
data
),
UniformGenerator
<
T
>
(
min
,
max
,
seed
));
}
};
}
// namespace operators
}
// namespace paddle
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_CUDA_KERNEL
(
uniform_random
,
paddle
::
operators
::
GPUUniformRandomKernel
<
float
>
,
paddle
::
operators
::
GPUUniformRandomKernel
<
double
>
,
paddle
::
operators
::
GPUUniformRandomKernel
<
plat
::
float16
>
);
REGISTER_OP_CUDA_KERNEL
(
uniform_random_batch_size_like
,
paddle
::
operators
::
GPUUniformRandomKernel
<
float
>
,
paddle
::
operators
::
GPUUniformRandomKernel
<
double
>
,
paddle
::
operators
::
GPUUniformRandomKernel
<
plat
::
float16
>
);
REGISTER_OP_CUDA_KERNEL
(
uniform_random
,
paddle
::
operators
::
GPUUniformRandomKernel
<
float
>
,
paddle
::
operators
::
GPUUniformRandomKernel
<
double
>
);
REGISTER_OP_CUDA_KERNEL
(
uniform_random_batch_size_like
,
paddle
::
operators
::
GPUUniformRandomKernel
<
float
>
,
paddle
::
operators
::
GPUUniformRandomKernel
<
double
>
);
python/paddle/fluid/trainer.py
浏览文件 @
216c6218
...
...
@@ -285,11 +285,12 @@ class Trainer(object):
self
.
_load_checkpoint
()
if
param_path
and
os
.
path
.
isdir
(
param_path
):
# load params from param_path into scope
io
.
load_persistables
(
executor
=
exe
,
dirname
=
param_path
,
main_program
=
self
.
startup_program
)
with
self
.
_prog_and_scope_guard
():
# load params from param_path into scope
io
.
load_persistables
(
executor
=
exe
,
dirname
=
param_path
,
main_program
=
self
.
startup_program
)
def
_transpile_nccl2_dist
(
self
):
# PADDLE_TRAINER_IPS
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录