Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
a72f68f2
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
a72f68f2
编写于
8月 16, 2018
作者:
T
tensor-tang
浏览文件
操作
浏览文件
下载
差异文件
Merge remote-tracking branch 'ups/develop' into feature/op/fusion_lstm
上级
df28a3b4
17da113c
变更
51
隐藏空白更改
内联
并排
Showing
51 changed file
with
746 addition
and
251 deletion
+746
-251
CMakeLists.txt
CMakeLists.txt
+2
-1
cmake/configure.cmake
cmake/configure.cmake
+4
-0
cmake/external/anakin.cmake
cmake/external/anakin.cmake
+2
-3
paddle/fluid/API.spec
paddle/fluid/API.spec
+3
-2
paddle/fluid/framework/threadpool.cc
paddle/fluid/framework/threadpool.cc
+7
-0
paddle/fluid/inference/api/CMakeLists.txt
paddle/fluid/inference/api/CMakeLists.txt
+1
-1
paddle/fluid/operators/activation_op.cu
paddle/fluid/operators/activation_op.cu
+3
-1
paddle/fluid/operators/activation_op.h
paddle/fluid/operators/activation_op.h
+6
-6
paddle/fluid/operators/assign_value_op.cu.cc
paddle/fluid/operators/assign_value_op.cu.cc
+4
-1
paddle/fluid/operators/conv_cudnn_op.cu.cc
paddle/fluid/operators/conv_cudnn_op.cu.cc
+36
-20
paddle/fluid/operators/cross_entropy_op.cu
paddle/fluid/operators/cross_entropy_op.cu
+8
-4
paddle/fluid/operators/distributed/variable_response.cc
paddle/fluid/operators/distributed/variable_response.cc
+6
-1
paddle/fluid/operators/elementwise_add_op.cu
paddle/fluid/operators/elementwise_add_op.cu
+2
-1
paddle/fluid/operators/elementwise_div_op.cu
paddle/fluid/operators/elementwise_div_op.cu
+7
-2
paddle/fluid/operators/elementwise_mul_op.cu
paddle/fluid/operators/elementwise_mul_op.cu
+7
-1
paddle/fluid/operators/elementwise_op_function.h
paddle/fluid/operators/elementwise_op_function.h
+2
-2
paddle/fluid/operators/elementwise_sub_op.cu
paddle/fluid/operators/elementwise_sub_op.cu
+7
-1
paddle/fluid/operators/fill_constant_op.cc
paddle/fluid/operators/fill_constant_op.cc
+19
-34
paddle/fluid/operators/fill_constant_op.cu.cc
paddle/fluid/operators/fill_constant_op.cu.cc
+26
-0
paddle/fluid/operators/fill_constant_op.h
paddle/fluid/operators/fill_constant_op.h
+48
-0
paddle/fluid/operators/fill_op.cc
paddle/fluid/operators/fill_op.cc
+1
-1
paddle/fluid/operators/gaussian_random_op.cu
paddle/fluid/operators/gaussian_random_op.cu
+2
-0
paddle/fluid/operators/listen_and_serv_op.cc
paddle/fluid/operators/listen_and_serv_op.cc
+4
-1
paddle/fluid/operators/math/cross_entropy.cu
paddle/fluid/operators/math/cross_entropy.cu
+18
-2
paddle/fluid/operators/math/cross_entropy.h
paddle/fluid/operators/math/cross_entropy.h
+17
-0
paddle/fluid/operators/math/selected_rows_functor.cu
paddle/fluid/operators/math/selected_rows_functor.cu
+11
-2
paddle/fluid/operators/math/softmax.cu
paddle/fluid/operators/math/softmax.cu
+3
-0
paddle/fluid/operators/mean_op.cu
paddle/fluid/operators/mean_op.cu
+6
-4
paddle/fluid/operators/mean_op.h
paddle/fluid/operators/mean_op.h
+1
-1
paddle/fluid/operators/mul_op.cu.cc
paddle/fluid/operators/mul_op.cu.cc
+4
-3
paddle/fluid/operators/pool_cudnn_op.cu.cc
paddle/fluid/operators/pool_cudnn_op.cu.cc
+4
-2
paddle/fluid/operators/prelu_op.cc
paddle/fluid/operators/prelu_op.cc
+52
-13
paddle/fluid/operators/prelu_op.cu
paddle/fluid/operators/prelu_op.cu
+0
-22
paddle/fluid/operators/prelu_op.h
paddle/fluid/operators/prelu_op.h
+73
-52
paddle/fluid/operators/scale_op.cu
paddle/fluid/operators/scale_op.cu
+5
-1
paddle/fluid/operators/scatter_op.h
paddle/fluid/operators/scatter_op.h
+2
-2
paddle/fluid/operators/softmax_cudnn_op.cu.cc
paddle/fluid/operators/softmax_cudnn_op.cu.cc
+2
-1
paddle/fluid/operators/softmax_op.cu.cc
paddle/fluid/operators/softmax_op.cu.cc
+2
-1
paddle/fluid/operators/sum_op.cu
paddle/fluid/operators/sum_op.cu
+4
-1
paddle/fluid/operators/sum_op.h
paddle/fluid/operators/sum_op.h
+1
-1
paddle/fluid/operators/top_k_op.cu
paddle/fluid/operators/top_k_op.cu
+22
-6
paddle/fluid/operators/uniform_random_op.cu
paddle/fluid/operators/uniform_random_op.cu
+50
-9
python/paddle/fluid/__init__.py
python/paddle/fluid/__init__.py
+1
-1
python/paddle/fluid/initializer.py
python/paddle/fluid/initializer.py
+0
-1
python/paddle/fluid/layers/nn.py
python/paddle/fluid/layers/nn.py
+60
-2
python/paddle/fluid/tests/unittests/CMakeLists.txt
python/paddle/fluid/tests/unittests/CMakeLists.txt
+2
-2
python/paddle/fluid/tests/unittests/test_dist_train.py
python/paddle/fluid/tests/unittests/test_dist_train.py
+17
-0
python/paddle/fluid/tests/unittests/test_dist_transpiler.py
python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+37
-13
python/paddle/fluid/tests/unittests/test_layers.py
python/paddle/fluid/tests/unittests/test_layers.py
+15
-0
python/paddle/fluid/tests/unittests/test_prelu_op.py
python/paddle/fluid/tests/unittests/test_prelu_op.py
+42
-14
python/paddle/fluid/transpiler/distribute_transpiler.py
python/paddle/fluid/transpiler/distribute_transpiler.py
+88
-12
未找到文件。
CMakeLists.txt
浏览文件 @
a72f68f2
...
...
@@ -204,11 +204,12 @@ include(external/snappy) # download snappy
include
(
external/snappystream
)
include
(
external/threadpool
)
set
(
WITH_ANAKIN OFF CACHE STRING
"Disable Anakin first, will add it later."
FORCE
)
if
(
WITH_GPU
)
include
(
cuda
)
include
(
tensorrt
)
include
(
external/anakin
)
elseif
()
set
(
WITH_ANAKIN OFF CACHE STRING
"Anakin is used in GPU only now."
FORCE
)
endif
()
include
(
cudnn
)
# set cudnn libraries, must before configure
...
...
cmake/configure.cmake
浏览文件 @
a72f68f2
...
...
@@ -104,6 +104,10 @@ if(WITH_GPU)
if
(
${
CUDNN_MAJOR_VERSION
}
VERSION_LESS 7
)
message
(
FATAL_ERROR
"Anakin needs CUDNN >= 7.0 to compile"
)
endif
()
set
(
ENV{CUDNN_INCLUDE_DIR}
${
CUDNN_INCLUDE_DIR
}
)
set
(
ENV{CUDNN_LIBRARY}
${
CUDNN_LIBRARY
}
)
message
(
STATUS
"cudnn include header is
${
CUDNN_INCLUDE_DIR
}
/cudnn.h"
)
message
(
STATUS
"cudnn library is
${
CUDNN_LIBRARY
}
"
)
endif
()
elseif
(
WITH_AMD_GPU
)
add_definitions
(
-DPADDLE_WITH_HIP
)
...
...
cmake/external/anakin.cmake
浏览文件 @
a72f68f2
...
...
@@ -35,9 +35,8 @@ set(ANAKIN_COMPILE_EXTRA_FLAGS
ExternalProject_Add
(
extern_anakin
${
EXTERNAL_PROJECT_LOG_ARGS
}
# TODO(luotao): use PaddlePaddle/Anakin later
GIT_REPOSITORY
"https://github.com/luotao1/Anakin"
GIT_TAG
"3957ae9263eaa0b1986758dac60a88852afb09be"
GIT_REPOSITORY
"https://github.com/PaddlePaddle/Anakin"
GIT_TAG
"04256ba78fa3da0beb74e8036c8efd68c12824d6"
PREFIX
${
ANAKIN_SOURCE_DIR
}
UPDATE_COMMAND
""
CMAKE_ARGS -DUSE_GPU_PLACE=YES
...
...
paddle/fluid/API.spec
浏览文件 @
a72f68f2
...
...
@@ -155,10 +155,11 @@ paddle.fluid.layers.resize_bilinear ArgSpec(args=['input', 'out_shape', 'scale',
paddle.fluid.layers.gather ArgSpec(args=['input', 'index'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.random_crop ArgSpec(args=['x', 'shape', 'seed'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.mean_iou ArgSpec(args=['input', 'label', 'num_classes'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.relu ArgSpec(args=['x'
], varargs=None, keywords=None, defaults=None
)
paddle.fluid.layers.log ArgSpec(args=['x'
], varargs=None, keywords=None, defaults=None
)
paddle.fluid.layers.relu ArgSpec(args=['x'
, 'name'], varargs=None, keywords=None, defaults=(None,)
)
paddle.fluid.layers.log ArgSpec(args=['x'
, 'name'], varargs=None, keywords=None, defaults=(None,)
)
paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.prelu ArgSpec(args=['x', 'mode', 'param_attr', 'name'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.flatten ArgSpec(args=['x', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None))
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True))
...
...
paddle/fluid/framework/threadpool.cc
浏览文件 @
a72f68f2
...
...
@@ -20,6 +20,9 @@
DEFINE_int32
(
io_threadpool_size
,
100
,
"number of threads used for doing IO, default 100"
);
DEFINE_int32
(
dist_threadpool_size
,
0
,
"number of threads used for distributed executed."
);
namespace
paddle
{
namespace
framework
{
...
...
@@ -35,6 +38,10 @@ void ThreadPool::Init() {
if
(
threadpool_
.
get
()
==
nullptr
)
{
// TODO(Yancey1989): specify the max threads number
int
num_threads
=
std
::
thread
::
hardware_concurrency
();
if
(
FLAGS_dist_threadpool_size
>
0
)
{
num_threads
=
FLAGS_dist_threadpool_size
;
VLOG
(
1
)
<<
"set dist_threadpool_size to "
<<
num_threads
;
}
PADDLE_ENFORCE_GT
(
num_threads
,
0
);
threadpool_
.
reset
(
new
ThreadPool
(
num_threads
));
}
...
...
paddle/fluid/inference/api/CMakeLists.txt
浏览文件 @
a72f68f2
...
...
@@ -60,7 +60,7 @@ cc_library(paddle_inference_tensorrt_subgraph_engine
inference_api_test
(
test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_engine_tester.cc ARGS test_word2vec
)
endif
()
if
(
WITH_ANAKIN
)
# only needed in CI
if
(
WITH_ANAKIN
AND WITH_GPU
)
# only needed in CI
# compile the libinference_anakin_api.a and anakin.so.
nv_library
(
inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber
)
#nv_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin)
...
...
paddle/fluid/operators/activation_op.cu
浏览文件 @
a72f68f2
...
...
@@ -26,6 +26,8 @@ namespace plat = paddle::platform;
act_type##_grad, ops::ActivationGradKernel<plat::CUDADeviceContext, \
ops::grad_functor<float>>, \
ops::ActivationGradKernel<plat::CUDADeviceContext, \
ops::grad_functor<double>>);
ops::grad_functor<double>>, \
ops::ActivationGradKernel<plat::CUDADeviceContext, \
ops::grad_functor<plat::float16>>);
FOR_EACH_KERNEL_FUNCTOR
(
REGISTER_ACTIVATION_CUDA_KERNEL
);
paddle/fluid/operators/activation_op.h
浏览文件 @
a72f68f2
...
...
@@ -333,8 +333,7 @@ struct SqrtGradFunctor : public BaseActivationFunctor<T> {
template
<
typename
Device
,
typename
X
,
typename
Out
,
typename
dOut
,
typename
dX
>
void
operator
()(
Device
d
,
X
x
,
Out
out
,
dOut
dout
,
dX
dx
)
const
{
const
Out
out_conj
=
Eigen
::
numext
::
conj
(
out
);
dx
.
device
(
d
)
=
static_cast
<
T
>
(
0.5
)
*
dout
/
out_conj
;
dx
.
device
(
d
)
=
static_cast
<
T
>
(
0.5
)
*
dout
/
out
;
}
};
...
...
@@ -740,7 +739,7 @@ struct PowGradFunctor : public BaseActivationFunctor<T> {
typename
dX
>
void
operator
()(
Device
d
,
X
x
,
Out
out
,
dOut
dout
,
dX
dx
)
const
{
dx
.
device
(
d
)
=
dout
*
static_cast
<
T
>
(
factor
)
*
x
.
pow
(
static_cast
<
T
>
(
factor
-
static_cast
<
T
>
(
1
)
));
x
.
pow
(
static_cast
<
T
>
(
factor
)
-
static_cast
<
T
>
(
1
));
}
};
...
...
@@ -863,10 +862,11 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
template
<
typename
Device
,
typename
X
,
typename
Out
,
typename
dOut
,
typename
dX
>
void
operator
()(
Device
d
,
X
x
,
Out
out
,
dOut
dout
,
dX
dx
)
const
{
T
b
=
static_cast
<
T
>
(
beta
);
auto
temp1
=
static_cast
<
T
>
(
1
)
/
(
static_cast
<
T
>
(
1
)
+
(
static_cast
<
T
>
(
-
b
eta
)
*
x
).
exp
());
auto
temp2
=
temp1
*
(
static_cast
<
T
>
(
1
)
-
(
b
eta
*
out
));
dx
.
device
(
d
)
=
dout
*
((
b
eta
*
out
)
+
temp2
);
(
static_cast
<
T
>
(
1
)
+
(
static_cast
<
T
>
(
-
b
)
*
x
).
exp
());
auto
temp2
=
temp1
*
(
static_cast
<
T
>
(
1
)
-
(
b
*
out
));
dx
.
device
(
d
)
=
dout
*
((
b
*
out
)
+
temp2
);
}
};
...
...
paddle/fluid/operators/assign_value_op.cu.cc
浏览文件 @
a72f68f2
...
...
@@ -13,7 +13,10 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/assign_value_op.h"
#include "paddle/fluid/platform/float16.h"
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_CUDA_KERNEL
(
assign_value
,
ops
::
AssignValueKernel
<
int
>
,
ops
::
AssignValueKernel
<
float
>
);
ops
::
AssignValueKernel
<
float
>
,
ops
::
AssignValueKernel
<
plat
::
float16
>
);
paddle/fluid/operators/conv_cudnn_op.cu.cc
浏览文件 @
a72f68f2
...
...
@@ -39,6 +39,27 @@ using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
static
constexpr
size_t
kCONV_CUDNN_WORKSPACE_LIMIT_BYTES
=
static_cast
<
size_t
>
(
1024
)
*
1024
*
1024
;
template
<
typename
T
,
typename
DeviceContext
>
// bool EnableFp16(const T& dummy, const DeviceContext& dev_ctx,
bool
EnableFp16
(
const
DeviceContext
&
dev_ctx
,
cudnnConvolutionDescriptor_t
cudnn_conv_desc
)
{
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
// Tensor core is supported since the volta GPU and
// is only enabled when input and filter data are float16
if
(
dev_ctx
.
GetComputeCapability
()
>=
70
&&
std
::
type_index
(
typeid
(
T
))
==
std
::
type_index
(
typeid
(
platform
::
float16
)))
{
PADDLE_ENFORCE
(
platform
::
dynload
::
cudnnSetConvolutionMathType
(
cudnn_conv_desc
,
CUDNN_TENSOR_OP_MATH
));
return
true
;
}
else
{
PADDLE_ENFORCE
(
platform
::
dynload
::
cudnnSetConvolutionMathType
(
cudnn_conv_desc
,
CUDNN_DEFAULT_MATH
));
}
#endif
return
false
;
}
template
<
typename
T
>
class
CUDNNConvOpKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
...
...
@@ -128,27 +149,14 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
cudnnConvolutionFwdAlgo_t
algo
;
auto
&
dev_ctx
=
ctx
.
template
device_context
<
platform
::
CUDADeviceContext
>();
auto
handle
=
dev_ctx
.
cudnn_handle
();
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionForwardAlgorithm
(
handle
,
cudnn_input_desc
,
cudnn_filter_desc
,
cudnn_conv_desc
,
cudnn_output_desc
,
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT
,
workspace_size_limit
,
&
algo
));
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
// Tensor core is supported since the volta GPU and
// is only enabled when input and filter data are float16
if
(
dev_ctx
.
GetComputeCapability
()
>=
70
&&
std
::
type_index
(
typeid
(
T
))
==
std
::
type_index
(
typeid
(
platform
::
float16
)))
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnSetConvolutionMathType
(
cudnn_conv_desc
,
CUDNN_TENSOR_OP_MATH
));
// Currently tensor core is only enabled using this algo
if
(
EnableFp16
<
T
>
(
dev_ctx
,
cudnn_conv_desc
))
{
algo
=
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
;
}
else
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnSetConvolutionMathType
(
cudnn_conv_desc
,
CUDNN_DEFAULT_MATH
));
PADDLE_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionForwardAlgorithm
(
handle
,
cudnn_input_desc
,
cudnn_filter_desc
,
cudnn_conv_desc
,
cudnn_output_desc
,
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT
,
workspace_size_limit
,
&
algo
));
}
#endif
// get workspace size able to allocate
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionForwardWorkspaceSize
(
...
...
@@ -288,6 +296,9 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
}
else
{
data_algo
=
CUDNN_CONVOLUTION_BWD_DATA_ALGO_1
;
}
if
(
EnableFp16
<
T
>
(
dev_ctx
,
cudnn_conv_desc
))
{
data_algo
=
CUDNN_CONVOLUTION_BWD_DATA_ALGO_1
;
}
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionBackwardDataWorkspaceSize
(
...
...
@@ -307,6 +318,9 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
}
else
{
filter_algo
=
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1
;
}
if
(
EnableFp16
<
T
>
(
dev_ctx
,
cudnn_conv_desc
))
{
filter_algo
=
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1
;
}
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionBackwardFilterWorkspaceSize
(
...
...
@@ -362,7 +376,8 @@ REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace,
paddle
::
operators
::
CUDNNConvOpKernel
<
plat
::
float16
>
);
REGISTER_OP_KERNEL
(
conv2d_grad
,
CUDNN
,
plat
::
CUDAPlace
,
paddle
::
operators
::
CUDNNConvGradOpKernel
<
float
>
,
paddle
::
operators
::
CUDNNConvGradOpKernel
<
double
>
);
paddle
::
operators
::
CUDNNConvGradOpKernel
<
double
>
,
paddle
::
operators
::
CUDNNConvGradOpKernel
<
plat
::
float16
>
);
REGISTER_OP_KERNEL
(
conv3d
,
CUDNN
,
plat
::
CUDAPlace
,
paddle
::
operators
::
CUDNNConvOpKernel
<
float
>
,
...
...
@@ -370,4 +385,5 @@ REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace,
paddle
::
operators
::
CUDNNConvOpKernel
<
plat
::
float16
>
);
REGISTER_OP_KERNEL
(
conv3d_grad
,
CUDNN
,
plat
::
CUDAPlace
,
paddle
::
operators
::
CUDNNConvGradOpKernel
<
float
>
,
paddle
::
operators
::
CUDNNConvGradOpKernel
<
double
>
);
paddle
::
operators
::
CUDNNConvGradOpKernel
<
double
>
,
paddle
::
operators
::
CUDNNConvGradOpKernel
<
plat
::
float16
>
)
paddle/fluid/operators/cross_entropy_op.cu
浏览文件 @
a72f68f2
...
...
@@ -13,12 +13,16 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/cross_entropy_op.h"
#include "paddle/fluid/platform/float16.h"
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
using
CUDACtx
=
paddle
::
platform
::
CUDADeviceContext
;
REGISTER_OP_CUDA_KERNEL
(
cross_entropy
,
ops
::
CrossEntropyOpKernel
<
CUDACtx
,
float
>
,
ops
::
CrossEntropyOpKernel
<
CUDACtx
,
double
>
);
REGISTER_OP_CUDA_KERNEL
(
cross_entropy_grad
,
ops
::
CrossEntropyGradientOpKernel
<
CUDACtx
,
float
>
,
ops
::
CrossEntropyGradientOpKernel
<
CUDACtx
,
double
>
);
ops
::
CrossEntropyOpKernel
<
CUDACtx
,
double
>
,
ops
::
CrossEntropyOpKernel
<
CUDACtx
,
plat
::
float16
>
);
REGISTER_OP_CUDA_KERNEL
(
cross_entropy_grad
,
ops
::
CrossEntropyGradientOpKernel
<
CUDACtx
,
float
>
,
ops
::
CrossEntropyGradientOpKernel
<
CUDACtx
,
double
>
,
ops
::
CrossEntropyGradientOpKernel
<
CUDACtx
,
plat
::
float16
>
);
paddle/fluid/operators/distributed/variable_response.cc
浏览文件 @
a72f68f2
...
...
@@ -190,12 +190,15 @@ bool VariableResponse::ProcSerializedField(
#endif
}
VLOG
(
7
)
<<
"ProcSerializedField:"
<<
meta_
.
varname
()
<<
", type:"
<<
meta_
.
type
()
<<
std
::
endl
;
framework
::
DDim
dims
=
GetDims
(
meta_
.
dims
());
if
(
meta_
.
type
()
==
sendrecv
::
LOD_TENSOR
)
{
PADDLE_ENFORCE
(
meta_
.
lod_size
()
>=
0
,
"lod info should be got first!"
);
if
(
!
CopyLodTensorData
(
input
,
*
dev_ctx_
,
dims
,
num_bytes
))
{
return
false
;
}
return
true
;
}
...
...
@@ -206,7 +209,9 @@ bool VariableResponse::ProcSerializedField(
return
true
;
}
return
true
;
PADDLE_ENFORCE
(
"not supported var types:"
,
meta_
.
varname
(),
meta_
.
type
());
return
false
;
}
};
// namespace distributed
...
...
paddle/fluid/operators/elementwise_add_op.cu
浏览文件 @
a72f68f2
...
...
@@ -30,4 +30,5 @@ REGISTER_OP_CUDA_KERNEL(
ops
::
ElementwiseAddGradKernel
<
plat
::
CUDADeviceContext
,
float
>
,
ops
::
ElementwiseAddGradKernel
<
plat
::
CUDADeviceContext
,
double
>
,
ops
::
ElementwiseAddGradKernel
<
plat
::
CUDADeviceContext
,
int
>
,
ops
::
ElementwiseAddGradKernel
<
plat
::
CUDADeviceContext
,
int64_t
>
);
ops
::
ElementwiseAddGradKernel
<
plat
::
CUDADeviceContext
,
int64_t
>
,
ops
::
ElementwiseAddGradKernel
<
plat
::
CUDADeviceContext
,
plat
::
float16
>
);
paddle/fluid/operators/elementwise_div_op.cu
浏览文件 @
a72f68f2
...
...
@@ -14,19 +14,24 @@ limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/fluid/operators/elementwise_div_op.h"
#include "paddle/fluid/platform/float16.h"
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_CUDA_KERNEL
(
elementwise_div
,
ops
::
ElementwiseDivKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
ElementwiseDivKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
ops
::
ElementwiseDivKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int
>
,
ops
::
ElementwiseDivKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
);
ops
::
ElementwiseDivKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
,
ops
::
ElementwiseDivKernel
<
paddle
::
platform
::
CUDADeviceContext
,
plat
::
float16
>
);
REGISTER_OP_CUDA_KERNEL
(
elementwise_div_grad
,
ops
::
ElementwiseDivGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
ElementwiseDivGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
ops
::
ElementwiseDivGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int
>
,
ops
::
ElementwiseDivGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
,
ops
::
ElementwiseDivGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
);
plat
::
float16
>
);
paddle/fluid/operators/elementwise_mul_op.cu
浏览文件 @
a72f68f2
...
...
@@ -14,19 +14,25 @@ limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/fluid/operators/elementwise_mul_op.h"
#include "paddle/fluid/platform/float16.h"
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_CUDA_KERNEL
(
elementwise_mul
,
ops
::
ElementwiseMulKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
ElementwiseMulKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
ops
::
ElementwiseMulKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int
>
,
ops
::
ElementwiseMulKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
);
ops
::
ElementwiseMulKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
,
ops
::
ElementwiseMulKernel
<
paddle
::
platform
::
CUDADeviceContext
,
plat
::
float16
>
);
REGISTER_OP_CUDA_KERNEL
(
elementwise_mul_grad
,
ops
::
ElementwiseMulGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
ElementwiseMulGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
ops
::
ElementwiseMulGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int
>
,
ops
::
ElementwiseMulGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
plat
::
float16
>
,
ops
::
ElementwiseMulGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
);
paddle/fluid/operators/elementwise_op_function.h
浏览文件 @
a72f68f2
...
...
@@ -350,7 +350,7 @@ static __global__ void ElemwiseGradBroadcast1CUDAKernel(
int
j
=
blockIdx
.
x
;
int
i
=
threadIdx
.
x
;
int
tid
=
threadIdx
.
x
;
T
val
=
0
;
T
val
(
0
)
;
do
{
int
x_offset
=
i
*
w
+
j
;
...
...
@@ -418,7 +418,7 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel(
int
tid
=
threadIdx
.
x
;
int
j
=
blockIdx
.
x
;
T
val
=
0
;
T
val
(
0
)
;
int
ttid
=
tid
;
while
(
true
)
{
...
...
paddle/fluid/operators/elementwise_sub_op.cu
浏览文件 @
a72f68f2
...
...
@@ -14,19 +14,25 @@ limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/fluid/operators/elementwise_sub_op.h"
#include "paddle/fluid/platform/float16.h"
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_CUDA_KERNEL
(
elementwise_sub
,
ops
::
ElementwiseSubKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
ElementwiseSubKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
ops
::
ElementwiseSubKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int
>
,
ops
::
ElementwiseSubKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
);
ops
::
ElementwiseSubKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
,
ops
::
ElementwiseSubKernel
<
paddle
::
platform
::
CUDADeviceContext
,
plat
::
float16
>
);
REGISTER_OP_CUDA_KERNEL
(
elementwise_sub_grad
,
ops
::
ElementwiseSubGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
ElementwiseSubGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
ops
::
ElementwiseSubGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int
>
,
ops
::
ElementwiseSubGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
plat
::
float16
>
,
ops
::
ElementwiseSubGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
);
paddle/fluid/operators/fill_constant_op.cc
浏览文件 @
a72f68f2
...
...
@@ -12,48 +12,28 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/operators/fill_constant_op.h"
#include "paddle/fluid/platform/float16.h"
namespace
paddle
{
namespace
operators
{
class
FillConstant
InferShape
:
public
framework
::
InferShapeBase
{
class
FillConstant
Op
:
public
framework
::
OperatorWithKernel
{
public:
void
operator
()(
framework
::
InferShapeContext
*
ctx
)
const
override
{
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Out"
),
"Output(Out) of FillConstantOp should not be null."
);
auto
&
shape
=
ctx
->
Attrs
().
Get
<
std
::
vector
<
int
>>
(
"shape"
);
auto
&
shape
=
ctx
->
Attrs
().
Get
<
std
::
vector
<
int
>>
(
"shape"
);
ctx
->
SetOutputDim
(
"Out"
,
framework
::
make_ddim
(
shape
));
}
};
class
FillConstantOp
:
public
framework
::
OperatorBase
{
public:
using
framework
::
OperatorBase
::
OperatorBase
;
private:
void
RunImpl
(
const
framework
::
Scope
&
scope
,
const
platform
::
Place
&
dev_place
)
const
override
{
auto
data_type
=
static_cast
<
framework
::
proto
::
VarType
::
Type
>
(
Attr
<
int
>
(
"dtype"
));
auto
value
=
Attr
<
float
>
(
"value"
);
auto
force_cpu
=
Attr
<
bool
>
(
"force_cpu"
);
auto
&
out
=
*
scope
.
FindVar
(
Output
(
"Out"
))
->
GetMutable
<
framework
::
LoDTensor
>
();
out
.
Resize
(
framework
::
make_ddim
(
Attr
<
std
::
vector
<
int
>>
(
"shape"
)));
if
(
force_cpu
)
{
auto
cpu
=
platform
::
CPUPlace
();
out
.
mutable_data
(
cpu
,
framework
::
ToTypeIndex
(
data_type
));
}
else
{
out
.
mutable_data
(
dev_place
,
framework
::
ToTypeIndex
(
data_type
));
}
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
&
dev_ctx
=
*
pool
.
Get
(
dev_place
);
math
::
set_constant
(
dev_ctx
,
&
out
,
value
);
framework
::
OpKernelType
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
return
framework
::
OpKernelType
(
static_cast
<
framework
::
proto
::
VarType
::
Type
>
(
ctx
.
Attr
<
int
>
(
"dtype"
)),
ctx
.
device_context
());
}
};
...
...
@@ -87,6 +67,11 @@ Fill up a variable with specified constant value.
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
REGISTER_OPERATOR
(
fill_constant
,
ops
::
FillConstantOp
,
ops
::
FillConstantInferShape
,
ops
::
FillConstantOpMaker
,
REGISTER_OPERATOR
(
fill_constant
,
ops
::
FillConstantOp
,
ops
::
FillConstantOpMaker
,
paddle
::
framework
::
EmptyGradOpMaker
);
REGISTER_OP_CPU_KERNEL
(
fill_constant
,
ops
::
FillConstantOpKernel
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
,
ops
::
FillConstantOpKernel
<
paddle
::
platform
::
CPUDeviceContext
,
double
>
,
ops
::
FillConstantOpKernel
<
paddle
::
platform
::
CPUDeviceContext
,
int
>
,
ops
::
FillConstantOpKernel
<
paddle
::
platform
::
CPUDeviceContext
,
int64_t
>
)
paddle/fluid/operators/fill_constant_op.cu.cc
0 → 100644
浏览文件 @
a72f68f2
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/fill_constant_op.h"
#include "paddle/fluid/platform/float16.h"
namespace
ops
=
paddle
::
operators
;
REGISTER_OP_CUDA_KERNEL
(
fill_constant
,
ops
::
FillConstantOpKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
FillConstantOpKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
ops
::
FillConstantOpKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int
>
,
ops
::
FillConstantOpKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
,
ops
::
FillConstantOpKernel
<
paddle
::
platform
::
CUDADeviceContext
,
paddle
::
platform
::
float16
>
)
paddle/fluid/operators/fill_constant_op.h
0 → 100644
浏览文件 @
a72f68f2
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace
paddle
{
namespace
operators
{
template
<
typename
DeviceContext
,
typename
T
>
class
FillConstantOpKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
data_type
=
static_cast
<
framework
::
proto
::
VarType
::
Type
>
(
ctx
.
Attr
<
int
>
(
"dtype"
));
auto
value
=
ctx
.
Attr
<
float
>
(
"value"
);
auto
force_cpu
=
ctx
.
Attr
<
bool
>
(
"force_cpu"
);
auto
*
out
=
ctx
.
Output
<
framework
::
Tensor
>
(
"Out"
);
out
->
Resize
(
framework
::
make_ddim
(
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"shape"
)));
if
(
force_cpu
)
{
auto
cpu
=
platform
::
CPUPlace
();
out
->
mutable_data
(
cpu
,
framework
::
ToTypeIndex
(
data_type
));
}
else
{
out
->
mutable_data
(
ctx
.
GetPlace
(),
framework
::
ToTypeIndex
(
data_type
));
}
math
::
set_constant
(
ctx
.
template
device_context
<
DeviceContext
>(),
out
,
value
);
}
};
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/fill_op.cc
浏览文件 @
a72f68f2
...
...
@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/float16.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -69,7 +70,6 @@ class FillOp : public framework::OperatorBase {
framework
::
VisitDataType
(
dtype
,
FillOpVisitor
(
&
tensor
,
Attr
<
std
::
vector
<
float
>>
(
"value"
)));
if
(
!
force_cpu
&&
platform
::
is_gpu_place
(
place
))
{
// Copy tensor to out
platform
::
DeviceContextPool
&
pool
=
...
...
paddle/fluid/operators/gaussian_random_op.cu
浏览文件 @
a72f68f2
...
...
@@ -15,6 +15,7 @@ limitations under the License. */
#include <thrust/transform.h>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/float16.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -60,6 +61,7 @@ class GPUGaussianRandomKernel : public framework::OpKernel<T> {
}
// namespace operators
}
// namespace paddle
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_CUDA_KERNEL
(
gaussian_random
,
paddle
::
operators
::
GPUGaussianRandomKernel
<
float
>
,
paddle
::
operators
::
GPUGaussianRandomKernel
<
double
>
);
...
...
paddle/fluid/operators/listen_and_serv_op.cc
浏览文件 @
a72f68f2
...
...
@@ -123,8 +123,11 @@ void ListenAndServOp::RunSyncLoop(
optimize_prepared
.
begin
(),
std
::
shared_ptr
<
framework
::
ExecutorPrepareContext
>
(
nullptr
));
// Trainers will get all parameters from pserver in the
// startup program, so we will wait RequestGet first
rpc_service_
->
SetCond
(
distributed
::
kRequestGet
);
rpc_service_
->
WaitBarrier
(
distributed
::
kRequestGet
);
rpc_service_
->
ResetBarrierCounter
();
while
(
true
)
{
rpc_service_
->
Profiler
().
OneStep
();
// Get from multiple trainers, we don't care about the order in which
...
...
paddle/fluid/operators/math/cross_entropy.cu
浏览文件 @
a72f68f2
...
...
@@ -15,11 +15,25 @@ limitations under the License. */
#include "paddle/fluid/operators/math/cross_entropy.h"
#include "paddle/fluid/platform/cuda_device_function.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/float16.h"
namespace
paddle
{
namespace
operators
{
namespace
math
{
template
<
typename
T
>
HOSTDEVICE
T
log
(
const
T
&
val
)
{
return
std
::
log
(
val
);
}
template
<
>
HOSTDEVICE
platform
::
float16
log
(
const
platform
::
float16
&
val
)
{
// strage bug, hlog is not exists.
return
static_cast
<
float16
>
(
0
);
// half tmp = static_cast<half>(val);
// return static_cast<platform::float16>(hlog(tmp));
}
namespace
{
template
<
typename
T
>
__global__
void
CrossEntropyKernel
(
T
*
Y
,
const
T
*
X
,
const
int64_t
*
label
,
...
...
@@ -35,12 +49,12 @@ template <typename T>
__global__
void
SoftCrossEntropyKernel
(
T
*
Y
,
const
T
*
X
,
const
T
*
label
,
const
int
class_num
)
{
int
tid
=
threadIdx
.
x
;
T
val
=
0
;
T
val
(
0
)
;
int
idx
=
blockIdx
.
x
*
class_num
+
tid
;
int
end
=
blockIdx
.
x
*
class_num
+
class_num
;
for
(;
idx
<
end
;
idx
+=
blockDim
.
x
)
{
val
+=
math
::
TolerableValue
<
T
>
()(
std
::
log
(
X
[
idx
]))
*
label
[
idx
];
val
+=
math
::
TolerableValue
<
T
>
()(
log
(
X
[
idx
]))
*
label
[
idx
];
}
val
=
paddle
::
platform
::
reduceSum
(
val
,
tid
,
blockDim
.
x
);
...
...
@@ -84,6 +98,8 @@ class CrossEntropyFunctor<platform::CUDADeviceContext, T> {
template
class
CrossEntropyFunctor
<
platform
::
CUDADeviceContext
,
float
>;
template
class
CrossEntropyFunctor
<
platform
::
CUDADeviceContext
,
double
>;
template
class
CrossEntropyFunctor
<
platform
::
CUDADeviceContext
,
platform
::
float16
>;
}
// namespace math
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/math/cross_entropy.h
浏览文件 @
a72f68f2
...
...
@@ -13,8 +13,10 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <limits>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/hostdevice.h"
namespace
paddle
{
...
...
@@ -33,6 +35,21 @@ struct TolerableValue {
}
};
// float16 value clip behave different.
using
paddle
::
platform
::
float16
;
using
paddle
::
platform
::
isfinite
;
template
<
>
struct
TolerableValue
<
float16
>
{
HOSTDEVICE
float16
operator
()(
const
float16
&
x
)
const
{
if
(
isfinite
(
x
))
return
x
;
else
if
(
x
>
static_cast
<
float16
>
(
0
))
return
std
::
numeric_limits
<
float16
>::
max
();
else
return
std
::
numeric_limits
<
float16
>::
min
();
}
};
template
<
typename
DeviceContext
,
typename
T
>
class
CrossEntropyFunctor
{
public:
...
...
paddle/fluid/operators/math/selected_rows_functor.cu
浏览文件 @
a72f68f2
...
...
@@ -18,6 +18,7 @@ limitations under the License. */
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/float16.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -76,6 +77,7 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> {
template
struct
SelectedRowsAdd
<
platform
::
CUDADeviceContext
,
float
>;
template
struct
SelectedRowsAdd
<
platform
::
CUDADeviceContext
,
double
>;
template
struct
SelectedRowsAdd
<
platform
::
CUDADeviceContext
,
platform
::
float16
>;
namespace
{
template
<
typename
T
,
int
block_size
>
...
...
@@ -120,7 +122,7 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
auto
*
out_data
=
output
->
data
<
T
>
();
SetConstant
<
platform
::
CUDADeviceContext
,
T
>
functor
;
functor
(
context
,
output
,
0.0
);
functor
(
context
,
output
,
static_cast
<
T
>
(
0
)
);
const
int
block_size
=
256
;
dim3
threads
(
block_size
,
1
);
...
...
@@ -138,6 +140,8 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
template
struct
SelectedRowsAddTensor
<
platform
::
CUDADeviceContext
,
float
>;
template
struct
SelectedRowsAddTensor
<
platform
::
CUDADeviceContext
,
double
>;
template
struct
SelectedRowsAddTensor
<
platform
::
CUDADeviceContext
,
platform
::
float16
>;
template
<
typename
T
>
struct
SelectedRowsAddTo
<
platform
::
CUDADeviceContext
,
T
>
{
...
...
@@ -177,6 +181,8 @@ template struct SelectedRowsAddTo<platform::CUDADeviceContext, float>;
template
struct
SelectedRowsAddTo
<
platform
::
CUDADeviceContext
,
double
>;
template
struct
SelectedRowsAddTo
<
platform
::
CUDADeviceContext
,
int
>;
template
struct
SelectedRowsAddTo
<
platform
::
CUDADeviceContext
,
int64_t
>;
template
struct
SelectedRowsAddTo
<
platform
::
CUDADeviceContext
,
platform
::
float16
>;
namespace
{
template
<
typename
T
,
int
block_size
>
...
...
@@ -229,6 +235,8 @@ template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, float>;
template
struct
SelectedRowsAddToTensor
<
platform
::
CUDADeviceContext
,
double
>;
template
struct
SelectedRowsAddToTensor
<
platform
::
CUDADeviceContext
,
int
>;
template
struct
SelectedRowsAddToTensor
<
platform
::
CUDADeviceContext
,
int64_t
>;
template
struct
SelectedRowsAddToTensor
<
platform
::
CUDADeviceContext
,
platform
::
float16
>;
namespace
scatter
{
...
...
@@ -276,7 +284,7 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
context
.
GetPlace
());
math
::
SetConstant
<
platform
::
CUDADeviceContext
,
T
>
constant_functor
;
constant_functor
(
context
,
out
.
mutable_value
(),
0.0
);
constant_functor
(
context
,
out
.
mutable_value
(),
static_cast
<
T
>
(
0
)
);
auto
*
out_data
=
out
.
mutable_value
()
->
data
<
T
>
();
auto
*
input_data
=
input
.
value
().
data
<
T
>
();
...
...
@@ -300,6 +308,7 @@ template struct MergeAdd<platform::CUDADeviceContext, float>;
template
struct
MergeAdd
<
platform
::
CUDADeviceContext
,
double
>;
template
struct
MergeAdd
<
platform
::
CUDADeviceContext
,
int
>;
template
struct
MergeAdd
<
platform
::
CUDADeviceContext
,
int64_t
>;
template
struct
MergeAdd
<
platform
::
CUDADeviceContext
,
platform
::
float16
>;
template
<
typename
T
,
int
block_size
>
__global__
void
UpdateToTensorKernel
(
const
T
*
selected_rows
,
...
...
paddle/fluid/operators/math/softmax.cu
浏览文件 @
a72f68f2
...
...
@@ -94,12 +94,15 @@ void SoftmaxGradCUDNNFunctor<T>::operator()(
template
class
SoftmaxCUDNNFunctor
<
platform
::
float16
>;
template
class
SoftmaxCUDNNFunctor
<
float
>;
template
class
SoftmaxCUDNNFunctor
<
double
>;
template
class
SoftmaxGradCUDNNFunctor
<
platform
::
float16
>;
template
class
SoftmaxGradCUDNNFunctor
<
float
>;
template
class
SoftmaxGradCUDNNFunctor
<
double
>;
template
class
SoftmaxFunctor
<
platform
::
CUDADeviceContext
,
platform
::
float16
>;
template
class
SoftmaxFunctor
<
platform
::
CUDADeviceContext
,
float
>;
template
class
SoftmaxFunctor
<
platform
::
CUDADeviceContext
,
double
>;
template
class
SoftmaxGradFunctor
<
platform
::
CUDADeviceContext
,
platform
::
float16
>;
template
class
SoftmaxGradFunctor
<
platform
::
CUDADeviceContext
,
float
>;
template
class
SoftmaxGradFunctor
<
platform
::
CUDADeviceContext
,
double
>;
...
...
paddle/fluid/operators/mean_op.cu
浏览文件 @
a72f68f2
...
...
@@ -12,14 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/fluid/operators/mean_op.h"
#include "paddle/fluid/platform/float16.h"
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_CUDA_KERNEL
(
mean
,
ops
::
MeanKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
MeanKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
);
ops
::
MeanKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
ops
::
MeanKernel
<
paddle
::
platform
::
CUDADeviceContext
,
plat
::
float16
>
);
REGISTER_OP_CUDA_KERNEL
(
mean_grad
,
ops
::
MeanGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
MeanGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
);
ops
::
MeanGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
ops
::
MeanGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
plat
::
float16
>
);
paddle/fluid/operators/mean_op.h
浏览文件 @
a72f68f2
...
...
@@ -55,7 +55,7 @@ class MeanGradKernel : public framework::OpKernel<T> {
IG
->
mutable_data
<
T
>
(
context
.
GetPlace
());
T
ig_size
=
static_cast
<
T
>
(
IG
->
numel
());
Eigen
::
DSizes
<
int
,
1
>
bcast
(
ig_size
);
Eigen
::
DSizes
<
int
,
1
>
bcast
(
static_cast
<
int
>
(
ig_size
)
);
EigenVector
<
T
>::
Flatten
(
*
IG
).
device
(
*
context
.
template
device_context
<
DeviceContext
>().
eigen_device
())
=
...
...
paddle/fluid/operators/mul_op.cu.cc
浏览文件 @
a72f68f2
...
...
@@ -20,6 +20,7 @@ namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL
(
mul
,
ops
::
MulKernel
<
plat
::
CUDADeviceContext
,
float
>
,
ops
::
MulKernel
<
plat
::
CUDADeviceContext
,
double
>
,
ops
::
MulKernel
<
plat
::
CUDADeviceContext
,
plat
::
float16
>
);
REGISTER_OP_CUDA_KERNEL
(
mul_grad
,
ops
::
MulGradKernel
<
plat
::
CUDADeviceContext
,
float
>
,
ops
::
MulGradKernel
<
plat
::
CUDADeviceContext
,
double
>
);
REGISTER_OP_CUDA_KERNEL
(
mul_grad
,
ops
::
MulGradKernel
<
plat
::
CUDADeviceContext
,
float
>
,
ops
::
MulGradKernel
<
plat
::
CUDADeviceContext
,
double
>
,
ops
::
MulGradKernel
<
plat
::
CUDADeviceContext
,
plat
::
float16
>
);
paddle/fluid/operators/pool_cudnn_op.cu.cc
浏览文件 @
a72f68f2
...
...
@@ -174,7 +174,8 @@ REGISTER_OP_KERNEL(pool2d, CUDNN, plat::CUDAPlace,
ops
::
PoolCUDNNOpKernel
<
plat
::
float16
>
);
REGISTER_OP_KERNEL
(
pool2d_grad
,
CUDNN
,
plat
::
CUDAPlace
,
ops
::
PoolCUDNNGradOpKernel
<
float
>
,
ops
::
PoolCUDNNGradOpKernel
<
double
>
);
ops
::
PoolCUDNNGradOpKernel
<
double
>
,
ops
::
PoolCUDNNGradOpKernel
<
plat
::
float16
>
);
REGISTER_OP_KERNEL
(
pool3d
,
CUDNN
,
plat
::
CUDAPlace
,
ops
::
PoolCUDNNOpKernel
<
float
>
,
...
...
@@ -182,4 +183,5 @@ REGISTER_OP_KERNEL(pool3d, CUDNN, plat::CUDAPlace,
ops
::
PoolCUDNNOpKernel
<
plat
::
float16
>
);
REGISTER_OP_KERNEL
(
pool3d_grad
,
CUDNN
,
plat
::
CUDAPlace
,
ops
::
PoolCUDNNGradOpKernel
<
float
>
,
ops
::
PoolCUDNNGradOpKernel
<
double
>
);
ops
::
PoolCUDNNGradOpKernel
<
double
>
,
ops
::
PoolCUDNNGradOpKernel
<
plat
::
float16
>
);
paddle/fluid/operators/prelu_op.cc
浏览文件 @
a72f68f2
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
...
@@ -26,14 +23,40 @@ class PReluOp : public framework::OperatorWithKernel {
:
OperatorWithKernel
(
type
,
inputs
,
outputs
,
attrs
)
{}
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
std
::
string
mode
=
ctx
->
Attrs
().
Get
<
std
::
string
>
(
"mode"
);
auto
x_dim
=
ctx
->
GetInputDim
(
"X"
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"X"
),
"Input(X) should not be null"
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Alpha"
),
"Input(Alpha) should not be null"
);
PADDLE_ENFORCE
(
product
(
ctx
->
GetInputDim
(
"Alpha"
))
==
1
,
"Size of weight Alpha must be one."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Out"
),
"Output(Out) should not be null"
);
ctx
->
SetOutputDim
(
"Out"
,
ctx
->
GetInputDim
(
"X"
));
if
(
mode
==
"all"
)
{
PADDLE_ENFORCE
(
product
(
ctx
->
GetInputDim
(
"Alpha"
))
==
1
,
"For mode 'all', size of weight Alpha must be one."
);
}
else
if
(
mode
==
"channel"
)
{
PADDLE_ENFORCE
(
product
(
ctx
->
GetInputDim
(
"Alpha"
))
==
x_dim
[
1
],
"For channel-wise mode, size of weight Alpha must be "
"equal to the number of channels, should be %d"
,
x_dim
[
1
]);
}
else
if
(
mode
==
"element"
)
{
PADDLE_ENFORCE
(
product
(
ctx
->
GetInputDim
(
"Alpha"
))
==
product
(
x_dim
),
"For element-wise mode, size of weight Alpha must be "
"equal to the number of input, should be %d"
,
product
(
x_dim
));
}
else
{
PADDLE_THROW
(
"Unkown mode %s"
,
mode
);
}
ctx
->
SetOutputDim
(
"Out"
,
x_dim
);
ctx
->
ShareLoD
(
"X"
,
/*->*/
"Out"
);
}
protected:
framework
::
OpKernelType
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
return
framework
::
OpKernelType
(
framework
::
ToDataType
(
ctx
.
Input
<
Tensor
>
(
"X"
)
->
type
()),
platform
::
CPUPlace
());
}
};
class
PReluOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
...
...
@@ -44,9 +67,7 @@ class PReluOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput
(
"Out"
,
"The output tensor of prelu operator."
);
AddComment
(
R"DOC(
PRelu Operator.
The equation is:
$$
f(x) =
\begin{cases}
...
...
@@ -54,11 +75,15 @@ f(x) =
x, \qquad \text{if} \ x >= 0
\end{cases}
$$
The input `X` can carry the LoD (Level of Details) information,
or not. And the output shares the LoD information with input `X`.
There are modes:
all: all elements share same weight
channel: elements in a channel share same weight
element: each element has a weight
)DOC"
);
AddAttr
<
std
::
string
>
(
"mode"
,
"The mode for inputs to share weights."
)
.
SetDefault
(
"all"
);
}
};
...
...
@@ -71,9 +96,23 @@ class PReluGradOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"X"
),
"Input(X) must not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
framework
::
GradVarName
(
"Out"
)),
"Input(Out@GRAD) should not be null"
);
ctx
->
SetOutputDim
(
framework
::
GradVarName
(
"X"
),
ctx
->
GetInputDim
(
"X"
));
ctx
->
SetOutputDim
(
framework
::
GradVarName
(
"Alpha"
),
ctx
->
GetInputDim
(
"Alpha"
));
auto
x_grad_name
=
framework
::
GradVarName
(
"X"
);
auto
alpha_grad_name
=
framework
::
GradVarName
(
"Alpha"
);
if
(
ctx
->
HasOutput
(
x_grad_name
))
{
ctx
->
SetOutputDim
(
x_grad_name
,
ctx
->
GetInputDim
(
"X"
));
}
if
(
ctx
->
HasOutput
(
alpha_grad_name
))
{
ctx
->
SetOutputDim
(
alpha_grad_name
,
ctx
->
GetInputDim
(
"Alpha"
));
}
}
protected:
framework
::
OpKernelType
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
return
framework
::
OpKernelType
(
framework
::
ToDataType
(
ctx
.
Input
<
Tensor
>
(
"X"
)
->
type
()),
platform
::
CPUPlace
());
}
};
...
...
paddle/fluid/operators/prelu_op.cu
已删除
100644 → 0
浏览文件 @
df28a3b4
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/prelu_op.h"
REGISTER_OP_CUDA_KERNEL
(
prelu
,
paddle
::
operators
::
PReluKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
);
REGISTER_OP_CUDA_KERNEL
(
prelu_grad
,
paddle
::
operators
::
PReluGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
);
paddle/fluid/operators/prelu_op.h
浏览文件 @
a72f68f2
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
...
@@ -13,32 +10,16 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/transform.h"
namespace
paddle
{
namespace
operators
{
using
Tensor
=
framework
::
Tensor
;
using
platform
::
Transform
;
template
<
typename
T
>
class
PReluFunctor
{
public:
explicit
PReluFunctor
(
const
T
*
alpha
)
:
alpha_
(
alpha
)
{}
HOSTDEVICE
T
operator
()(
const
T
&
x
)
const
{
if
(
x
>
0
)
return
x
;
else
return
x
*
(
*
alpha_
);
}
private:
const
T
*
alpha_
;
};
template
<
typename
DeviceContext
,
typename
T
>
class
PReluKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
...
...
@@ -50,53 +31,93 @@ class PReluKernel : public framework::OpKernel<T> {
const
T
*
x_ptr
=
x
->
data
<
T
>
();
T
*
o_ptr
=
out
->
mutable_data
<
T
>
(
context
.
GetPlace
());
auto
*
alpha_ptr
=
alpha
->
data
<
T
>
();
const
T
*
alpha_ptr
=
alpha
->
data
<
T
>
();
std
::
string
mode
=
context
.
Attr
<
std
::
string
>
(
"mode"
);
int
numel
=
x
->
numel
();
Transform
<
DeviceContext
>
trans
;
trans
(
context
.
template
device_context
<
DeviceContext
>(),
x_ptr
,
x_ptr
+
numel
,
o_ptr
,
PReluFunctor
<
T
>
(
alpha_ptr
));
}
};
template
<
typename
T
>
class
PReluGradFunctor
{
public:
explicit
PReluGradFunctor
(
const
T
*
alpha
)
:
alpha_
(
alpha
)
{}
HOSTDEVICE
T
operator
()(
const
T
&
out
,
const
T
&
dout
)
const
{
if
(
out
>
0
)
return
dout
;
else
return
dout
*
(
*
alpha_
);
auto
dim
=
x
->
dims
();
int
index
=
0
;
int
i
=
0
;
int
temp
=
0
;
if
(
mode
==
"channel"
)
{
for
(
i
=
0
;
i
<
numel
;
i
++
)
{
temp
=
numel
/
(
dim
[
0
]
*
dim
[
1
]);
index
=
(
i
/
temp
)
%
dim
[
1
];
o_ptr
[
i
]
=
x_ptr
[
i
]
>
0
?
x_ptr
[
i
]
:
alpha_ptr
[
index
]
*
x_ptr
[
i
];
}
}
else
if
(
mode
==
"element"
)
{
for
(
i
=
0
;
i
<
numel
;
i
++
)
{
o_ptr
[
i
]
=
x_ptr
[
i
]
>
0
?
x_ptr
[
i
]
:
alpha_ptr
[
i
]
*
x_ptr
[
i
];
}
}
else
{
for
(
i
=
0
;
i
<
numel
;
i
++
)
{
o_ptr
[
i
]
=
x_ptr
[
i
]
>
0
?
x_ptr
[
i
]
:
alpha_ptr
[
0
]
*
x_ptr
[
i
];
}
}
}
private:
const
T
*
alpha_
;
};
template
<
typename
DeviceContext
,
typename
T
>
class
PReluGradKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
auto
*
x
=
context
.
Input
<
Tensor
>
(
"X"
);
auto
*
dx
=
context
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"X"
));
auto
*
dout
=
context
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"Out"
));
auto
*
dalpha
=
context
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"Alpha"
));
auto
*
out
=
context
.
Input
<
Tensor
>
(
"Out"
);
auto
*
alpha
=
context
.
Input
<
Tensor
>
(
"Alpha"
);
auto
*
alpha_ptr
=
alpha
->
data
<
T
>
();
T
*
dx_ptr
=
dx
->
mutable_data
<
T
>
(
context
.
GetPlace
());
const
T
*
alpha_ptr
=
alpha
->
data
<
T
>
();
const
T
*
x_ptr
=
x
->
data
<
T
>
();
const
T
*
dout_ptr
=
dout
->
data
<
T
>
();
const
T
*
out_ptr
=
out
->
data
<
T
>
();
int
numel
=
dx
->
numel
();
Transform
<
DeviceContext
>
trans
;
trans
(
context
.
template
device_context
<
DeviceContext
>(),
out_ptr
,
out_ptr
+
numel
,
dout_ptr
,
dx_ptr
,
PReluGradFunctor
<
T
>
(
alpha_ptr
));
// TODO(Zhuoyuan): add dalpha upgrade when GPU kernels ready
std
::
string
mode
=
context
.
Attr
<
std
::
string
>
(
"mode"
);
int
numel
=
x
->
numel
();
auto
dim
=
x
->
dims
();
int
index
=
0
;
int
i
=
0
;
int
temp
=
0
;
if
(
dx
)
{
T
*
dx_ptr
=
dx
->
mutable_data
<
T
>
(
context
.
GetPlace
());
if
(
mode
==
"channel"
)
{
for
(
i
=
0
;
i
<
numel
;
i
++
)
{
temp
=
numel
/
(
dim
[
0
]
*
dim
[
1
]);
index
=
(
i
/
temp
)
%
dim
[
1
];
dx_ptr
[
i
]
=
out_ptr
[
i
]
>
0
?
dout_ptr
[
i
]
:
alpha_ptr
[
index
]
*
dout_ptr
[
i
];
}
}
else
if
(
mode
==
"element"
)
{
for
(
i
=
0
;
i
<
numel
;
i
++
)
{
dx_ptr
[
i
]
=
out_ptr
[
i
]
>
0
?
dout_ptr
[
i
]
:
alpha_ptr
[
i
]
*
dout_ptr
[
i
];
}
}
else
{
for
(
i
=
0
;
i
<
numel
;
i
++
)
{
dx_ptr
[
i
]
=
out_ptr
[
i
]
>
0
?
dout_ptr
[
i
]
:
alpha_ptr
[
0
]
*
dout_ptr
[
i
];
}
}
}
index
=
0
;
if
(
dalpha
)
{
T
*
dalpha_ptr
=
dalpha
->
mutable_data
<
T
>
(
context
.
GetPlace
());
if
(
mode
==
"channel"
)
{
for
(
i
=
0
;
i
<
numel
;
i
++
)
{
temp
=
numel
/
(
dim
[
0
]
*
dim
[
1
]);
index
=
(
i
/
temp
)
%
dim
[
1
];
dalpha_ptr
[
index
]
+=
out_ptr
[
i
]
>
0
?
0
:
x_ptr
[
i
]
*
dout_ptr
[
i
];
}
}
else
if
(
mode
==
"element"
)
{
for
(
i
=
0
;
i
<
numel
;
i
++
)
{
dalpha_ptr
[
i
]
+=
out_ptr
[
i
]
>
0
?
0
:
x_ptr
[
i
]
*
dout_ptr
[
i
];
}
}
else
{
for
(
i
=
0
;
i
<
numel
;
i
++
)
{
dalpha_ptr
[
0
]
+=
out_ptr
[
i
]
>
0
?
0
:
x_ptr
[
i
]
*
dout_ptr
[
i
];
}
}
}
// TODO(Guanzhong): add GPU kernels
}
};
...
...
paddle/fluid/operators/scale_op.cu
浏览文件 @
a72f68f2
...
...
@@ -13,11 +13,15 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/scale_op.h"
#include "paddle/fluid/platform/float16.h"
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_CUDA_KERNEL
(
scale
,
paddle
::
operators
::
ScaleKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
paddle
::
operators
::
ScaleKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
paddle
::
operators
::
ScaleKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int
>
,
paddle
::
operators
::
ScaleKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
);
int64_t
>
,
paddle
::
operators
::
ScaleKernel
<
paddle
::
platform
::
CUDADeviceContext
,
plat
::
float16
>
);
paddle/fluid/operators/scatter_op.h
浏览文件 @
a72f68f2
...
...
@@ -35,7 +35,7 @@ class ScatterOpKernel : public framework::OpKernel<T> {
auto
*
Out
=
ctx
.
Output
<
Tensor
>
(
"Out"
);
// In place output: Out = X, Out[Ids] += Updates
Out
->
ShareDataWith
(
*
X
);
framework
::
TensorCopySync
(
*
X
,
ctx
.
GetPlace
(),
Out
);
// Apply ScatterUpdate: Out[index] += Updates[:]
ScatterAssign
<
T
>
(
ctx
.
device_context
(),
*
Updates
,
*
Ids
,
Out
);
}
...
...
@@ -53,7 +53,7 @@ class ScatterGradientOpKernel : public framework::OpKernel<T> {
auto
*
dOut
=
ctx
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"Out"
));
// In place gradient: dX = dO
dX
->
ShareDataWith
(
*
dOut
);
framework
::
TensorCopySync
(
*
dOut
,
ctx
.
GetPlace
(),
dX
);
dUpdates
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
// Gradient by Gather: dUpdates += dO[Ids]
CPUGather
<
T
>
(
ctx
.
device_context
(),
*
dOut
,
*
Ids
,
dUpdates
);
...
...
paddle/fluid/operators/softmax_cudnn_op.cu.cc
浏览文件 @
a72f68f2
...
...
@@ -78,4 +78,5 @@ REGISTER_OP_KERNEL(softmax, CUDNN, plat::CUDAPlace,
ops
::
SoftmaxCUDNNKernel
<
float
>
,
ops
::
SoftmaxCUDNNKernel
<
plat
::
float16
>
);
REGISTER_OP_KERNEL
(
softmax_grad
,
CUDNN
,
plat
::
CUDAPlace
,
ops
::
SoftmaxGradCUDNNKernel
<
float
>
);
ops
::
SoftmaxGradCUDNNKernel
<
float
>
,
ops
::
SoftmaxGradCUDNNKernel
<
plat
::
float16
>
);
paddle/fluid/operators/softmax_op.cu.cc
浏览文件 @
a72f68f2
...
...
@@ -23,4 +23,5 @@ REGISTER_OP_CUDA_KERNEL(
ops
::
SoftmaxKernel
<
plat
::
CUDADeviceContext
,
plat
::
float16
>
);
REGISTER_OP_CUDA_KERNEL
(
softmax_grad
,
ops
::
SoftmaxGradKernel
<
plat
::
CUDADeviceContext
,
float
>
,
ops
::
SoftmaxGradKernel
<
plat
::
CUDADeviceContext
,
double
>
);
ops
::
SoftmaxGradKernel
<
plat
::
CUDADeviceContext
,
double
>
,
ops
::
SoftmaxGradKernel
<
plat
::
CUDADeviceContext
,
plat
::
float16
>
);
paddle/fluid/operators/sum_op.cu
浏览文件 @
a72f68f2
...
...
@@ -11,10 +11,13 @@ limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/fluid/operators/sum_op.h"
#include "paddle/fluid/platform/float16.h"
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_CUDA_KERNEL
(
sum
,
ops
::
SumKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
SumKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
ops
::
SumKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int
>
,
ops
::
SumKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
);
ops
::
SumKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
,
ops
::
SumKernel
<
paddle
::
platform
::
CUDADeviceContext
,
plat
::
float16
>
);
paddle/fluid/operators/sum_op.h
浏览文件 @
a72f68f2
...
...
@@ -46,7 +46,7 @@ class SumKernel : public framework::OpKernel<T> {
if
(
!
in_place
)
{
math
::
SetConstant
<
DeviceContext
,
T
>
constant_functor
;
constant_functor
(
context
.
template
device_context
<
DeviceContext
>(),
out
,
0.0
);
static_cast
<
T
>
(
0
)
);
}
math
::
SelectedRowsAddToTensor
<
DeviceContext
,
T
>
functor
;
...
...
paddle/fluid/operators/top_k_op.cu
浏览文件 @
a72f68f2
...
...
@@ -11,16 +11,19 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <limits>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/top_k_op.h"
#include "paddle/fluid/platform/assert.h"
#include "paddle/fluid/platform/cuda_device_function.h"
#include "paddle/fluid/platform/float16.h"
namespace
paddle
{
namespace
operators
{
using
Tensor
=
framework
::
Tensor
;
using
paddle
::
platform
::
float16
;
template
<
typename
T
>
struct
Pair
{
...
...
@@ -32,6 +35,11 @@ struct Pair {
id
=
id
;
}
__device__
__forceinline__
void
clear
()
{
v
=
-
INFINITY
;
id
=
-
1
;
}
__device__
__forceinline__
void
operator
=
(
const
Pair
<
T
>&
in
)
{
v
=
in
.
v
;
id
=
in
.
id
;
...
...
@@ -53,6 +61,12 @@ struct Pair {
int64_t
id
;
};
template
<
>
__device__
__forceinline__
void
Pair
<
float16
>::
clear
()
{
v
=
platform
::
raw_uint16_to_float16
(
0x400
);
id
=
-
1
;
}
template
<
typename
T
>
__device__
__forceinline__
void
AddTo
(
Pair
<
T
>
topk
[],
const
Pair
<
T
>&
p
,
int
beam_size
)
{
...
...
@@ -150,7 +164,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
if
(
k
<
MaxLength
-
(
*
beam
))
{
topk
[
k
]
=
topk
[
k
+
*
beam
];
}
else
{
topk
[
k
].
set
(
-
INFINITY
,
-
1
);
topk
[
k
].
clear
(
);
}
}
if
(
!
(
*
is_empty
))
{
...
...
@@ -160,7 +174,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
}
*
max
=
topk
[
MaxLength
-
1
];
if
((
*
max
).
v
==
-
1
)
*
is_empty
=
true
;
if
((
*
max
).
v
==
static_cast
<
T
>
(
-
1
)
)
*
is_empty
=
true
;
*
beam
=
0
;
}
}
...
...
@@ -181,7 +195,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
if
(
k
<
MaxLength
-
*
beam
)
{
topk
[
k
]
=
topk
[
k
+
*
beam
];
}
else
{
topk
[
k
].
set
(
-
INFINITY
,
-
1
);
topk
[
k
].
set
(
std
::
numeric_limits
<
T
>::
min
()
,
-
1
);
}
}
if
(
!
(
*
is_empty
))
{
...
...
@@ -273,7 +287,7 @@ __global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices,
bool
firststep
=
true
;
for
(
int
k
=
0
;
k
<
MaxLength
;
k
++
)
{
topk
[
k
].
set
(
-
INFINITY
,
-
1
);
topk
[
k
].
clear
(
);
}
while
(
k
)
{
ThreadGetTopK
<
T
,
MaxLength
,
BlockSize
>
(
topk
,
&
beam
,
k
,
...
...
@@ -325,5 +339,7 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
}
// namespace operators
}
// namespace paddle
REGISTER_OP_CUDA_KERNEL
(
top_k
,
paddle
::
operators
::
TopkOpCUDAKernel
<
float
>
,
paddle
::
operators
::
TopkOpCUDAKernel
<
double
>
);
REGISTER_OP_CUDA_KERNEL
(
top_k
,
paddle
::
operators
::
TopkOpCUDAKernel
<
float
>
,
paddle
::
operators
::
TopkOpCUDAKernel
<
double
>
,
paddle
::
operators
::
TopkOpCUDAKernel
<
paddle
::
platform
::
float16
>
);
paddle/fluid/operators/uniform_random_op.cu
浏览文件 @
a72f68f2
...
...
@@ -11,10 +11,14 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <glog/logging.h>
#include <thrust/random.h>
#include <thrust/transform.h>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/transform.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -36,6 +40,11 @@ struct UniformGenerator {
}
};
template
<
typename
T
,
typename
V
>
struct
CastFunctor
{
HOSTDEVICE
V
operator
()(
const
T
&
a
)
{
return
static_cast
<
V
>
(
a
);
}
};
// It seems that Eigen::Tensor::random in GPU will SEGFAULT.
// Use std::random and thrust::random(thrust is a std library in CUDA) to
// implement uniform random.
...
...
@@ -66,18 +75,50 @@ class GPUUniformRandomKernel : public framework::OpKernel<T> {
T
max
=
static_cast
<
T
>
(
context
.
Attr
<
float
>
(
"max"
));
thrust
::
counting_iterator
<
unsigned
int
>
index_sequence_begin
(
0
);
int64_t
size
=
tensor
->
numel
();
thrust
::
transform
(
index_sequence_begin
,
index_sequence_begin
+
size
,
thrust
::
device_ptr
<
T
>
(
data
),
UniformGenerator
<
T
>
(
min
,
max
,
seed
));
if
(
out_var
->
IsType
<
framework
::
LoDTensor
>
()
&&
std
::
type_index
(
typeid
(
T
))
==
std
::
type_index
(
typeid
(
platform
::
float16
)))
{
framework
::
Tensor
master_copy_tensor
;
master_copy_tensor
.
Resize
(
tensor
->
dims
());
float
*
master_copy_tensor_data
=
master_copy_tensor
.
mutable_data
<
float
>
(
context
.
GetPlace
());
thrust
::
transform
(
index_sequence_begin
,
index_sequence_begin
+
size
,
thrust
::
device_ptr
<
float
>
(
master_copy_tensor_data
),
UniformGenerator
<
float
>
(
static_cast
<
float
>
(
min
),
static_cast
<
float
>
(
max
),
seed
));
platform
::
Transform
<
platform
::
CUDADeviceContext
>
trans
;
auto
*
in_begin
=
master_copy_tensor
.
data
<
float
>
();
auto
*
in_end
=
in_begin
+
master_copy_tensor
.
numel
();
auto
*
out_begin
=
tensor
->
mutable_data
<
T
>
(
context
.
GetPlace
());
trans
(
context
.
template
device_context
<
platform
::
CUDADeviceContext
>(),
in_begin
,
in_end
,
out_begin
,
CastFunctor
<
float
,
T
>
());
}
else
{
thrust
::
transform
(
index_sequence_begin
,
index_sequence_begin
+
size
,
thrust
::
device_ptr
<
T
>
(
data
),
UniformGenerator
<
T
>
(
min
,
max
,
seed
));
}
if
(
VLOG_IS_ON
(
5
))
{
framework
::
Tensor
cpu_tensor
;
framework
::
TensorCopySync
(
*
tensor
,
platform
::
CPUPlace
(),
&
cpu_tensor
);
auto
&
dev_ctx
=
*
platform
::
DeviceContextPool
::
Instance
().
Get
(
context
.
GetPlace
());
dev_ctx
.
Wait
();
auto
x
=
framework
::
EigenVector
<
T
>::
Flatten
(
cpu_tensor
);
VLOG
(
5
)
<<
"The Uniform output "
<<
x
;
}
}
};
}
// namespace operators
}
// namespace paddle
REGISTER_OP_CUDA_KERNEL
(
uniform_random
,
paddle
::
operators
::
GPUUniformRandomKernel
<
float
>
,
paddle
::
operators
::
GPUUniformRandomKernel
<
double
>
);
REGISTER_OP_CUDA_KERNEL
(
uniform_random_batch_size_like
,
paddle
::
operators
::
GPUUniformRandomKernel
<
float
>
,
paddle
::
operators
::
GPUUniformRandomKernel
<
double
>
);
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_CUDA_KERNEL
(
uniform_random
,
paddle
::
operators
::
GPUUniformRandomKernel
<
float
>
,
paddle
::
operators
::
GPUUniformRandomKernel
<
double
>
,
paddle
::
operators
::
GPUUniformRandomKernel
<
plat
::
float16
>
);
REGISTER_OP_CUDA_KERNEL
(
uniform_random_batch_size_like
,
paddle
::
operators
::
GPUUniformRandomKernel
<
float
>
,
paddle
::
operators
::
GPUUniformRandomKernel
<
double
>
,
paddle
::
operators
::
GPUUniformRandomKernel
<
plat
::
float16
>
);
python/paddle/fluid/__init__.py
浏览文件 @
a72f68f2
...
...
@@ -122,7 +122,7 @@ def __bootstrap__():
'use_pinned_memory'
,
'check_nan_inf'
,
'benchmark'
,
'warpctc_dir'
,
'eager_delete_scope'
,
'use_mkldnn'
,
'initial_cpu_memory_in_mb'
,
'init_allocated_mem'
,
'free_idle_memory'
,
'paddle_num_threads'
,
'cpu_deterministic'
"dist_threadpool_size"
,
'cpu_deterministic'
]
if
core
.
is_compiled_with_dist
():
read_env_flags
.
append
(
'rpc_deadline'
)
...
...
python/paddle/fluid/initializer.py
浏览文件 @
a72f68f2
...
...
@@ -15,7 +15,6 @@
from
.
import
framework
import
numpy
as
np
import
contextlib
from
.framework
import
convert_np_dtype_to_dtype_
from
.core
import
VarDesc
__all__
=
[
...
...
python/paddle/fluid/layers/nn.py
浏览文件 @
a72f68f2
...
...
@@ -112,6 +112,7 @@ __all__ = [
'log'
,
'crop'
,
'rank_loss'
,
'prelu'
,
'flatten'
,
]
...
...
@@ -5089,7 +5090,7 @@ def random_crop(x, shape, seed=None):
return
out
def
log
(
x
):
def
log
(
x
,
name
=
None
):
"""
Calculates the natural log of the given input tensor, element-wise.
...
...
@@ -5099,6 +5100,8 @@ def log(x):
Args:
x (Variable): Input tensor.
name (str|None, default None): A name for this layer If set None,
the layer will be named automatically.
Returns:
Variable: The natural log of the input tensor computed element-wise.
...
...
@@ -5116,7 +5119,7 @@ def log(x):
return
out
def
relu
(
x
):
def
relu
(
x
,
name
=
None
):
"""
Relu takes one input data (Tensor) and produces one output data (Tensor)
where the rectified linear function, y = max(0, x), is applied to
...
...
@@ -5128,6 +5131,8 @@ def relu(x):
Args:
x (Variable): The input tensor.
name (str|None, default None): A name for this layer If set None,
the layer will be named automatically.
Returns:
Variable: The output tensor with the same shape as input.
...
...
@@ -5364,6 +5369,59 @@ def rank_loss(label, left, right, name=None):
return
out
def
prelu
(
x
,
mode
,
param_attr
=
None
,
name
=
None
):
"""
Equation:
y = \max(0, x) + alpha \min(0, x)
Args:
x (Variable): The input tensor.
param_attr(ParamAttr|None): The parameter attribute for the learnable
weight (alpha).
mode (string): The mode for weight sharing
all: all elements share same weight
channel:elements in a channel share same weight
element:each element has a weight
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
Returns:
Variable: The output tensor with the same shape as input.
Examples:
.. code-block:: python
x = fluid.layers.data(name="x", shape=[10,10], dtype="float32")
mode = 'channel'
output = fluid.layers.prelu(x,mode)
"""
helper
=
LayerHelper
(
'prelu'
,
**
locals
())
if
mode
not
in
[
'all'
,
'channel'
,
'element'
]:
raise
ValueError
(
'mode should be one of all, channel, element.'
)
alpha_shape
=
[
1
]
if
mode
==
'channel'
:
alpha_shape
=
[
1
,
x
.
shape
[
1
],
1
,
1
]
elif
mode
==
'element'
:
alpha_shape
=
x
.
shape
dtype
=
helper
.
input_dtype
(
input_param_name
=
'x'
)
alpha
=
helper
.
create_parameter
(
attr
=
param_attr
,
shape
=
alpha_shape
,
dtype
=
'float32'
,
is_bias
=
False
,
default_initializer
=
Constant
(
1.0
))
out
=
helper
.
create_tmp_variable
(
dtype
)
helper
.
append_op
(
type
=
"prelu"
,
inputs
=
{
"X"
:
x
,
'Alpha'
:
alpha
},
attrs
=
{
"mode"
:
mode
},
outputs
=
{
"Out"
:
out
})
return
out
def
flatten
(
x
,
axis
=
1
,
name
=
None
):
"""
**Flatten layer**
...
...
python/paddle/fluid/tests/unittests/CMakeLists.txt
浏览文件 @
a72f68f2
...
...
@@ -59,8 +59,8 @@ py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=$
if
(
WITH_DISTRIBUTE
)
py_test_modules
(
test_dist_train MODULES test_dist_train SERIAL
)
set_tests_properties
(
test_listen_and_serv_op PROPERTIES TIMEOUT 20
)
set_tests_properties
(
test_dist_mnist PROPERTIES TIMEOUT
18
0
)
set_tests_properties
(
test_dist_word2vec PROPERTIES TIMEOUT
18
0
)
set_tests_properties
(
test_dist_mnist PROPERTIES TIMEOUT
20
0
)
set_tests_properties
(
test_dist_word2vec PROPERTIES TIMEOUT
20
0
)
endif
()
py_test_modules
(
test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL
)
py_test_modules
(
test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL
)
...
...
python/paddle/fluid/tests/unittests/test_dist_train.py
浏览文件 @
a72f68f2
...
...
@@ -26,6 +26,12 @@ from paddle.fluid.layers.io import ListenAndServ
from
paddle.fluid.layers.io
import
Recv
from
paddle.fluid.layers.io
import
Send
from
paddle.fluid
import
core
RPC_OP_ROLE_ATTR_NAME
=
op_role_attr_name
=
core
.
op_proto_and_checker_maker
.
kOpRoleAttrName
(
)
RPC_OP_ROLE_ATTR_VALUE
=
core
.
op_proto_and_checker_maker
.
OpRole
.
RPC
class
TestSendOp
(
unittest
.
TestCase
):
def
test_send
(
self
):
...
...
@@ -89,18 +95,29 @@ class TestSendOp(unittest.TestCase):
def
init_client
(
self
,
place
,
port
):
main
=
fluid
.
Program
()
with
fluid
.
program_guard
(
main
):
main
.
global_block
().
append_op
(
type
=
"fetch_barrier"
,
inputs
=
{},
outputs
=
{},
attrs
=
{
"endpoints"
:
[
"127.0.0.1:{0}"
.
format
(
port
)],
RPC_OP_ROLE_ATTR_NAME
:
RPC_OP_ROLE_ATTR_VALUE
})
x
=
layers
.
data
(
shape
=
[
32
,
32
],
dtype
=
'float32'
,
name
=
'X'
,
append_batch_size
=
False
)
fluid
.
initializer
.
Constant
(
value
=
2.3
)(
x
,
main
.
global_block
())
get_var
=
main
.
global_block
().
create_var
(
name
=
"scale_0.tmp_0"
,
# server side var
dtype
=
"float32"
,
persistable
=
False
,
shape
=
[
32
,
32
])
fluid
.
initializer
.
Constant
(
value
=
2.3
)(
get_var
,
main
.
global_block
())
Send
(
"127.0.0.1:%d"
%
port
,
[
x
])
o
=
Recv
(
"127.0.0.1:%d"
%
port
,
[
get_var
])
...
...
python/paddle/fluid/tests/unittests/test_dist_transpiler.py
浏览文件 @
a72f68f2
...
...
@@ -18,6 +18,7 @@ import unittest
import
paddle.fluid
as
fluid
from
paddle.fluid.transpiler.distribute_transpiler
import
delete_ops
import
traceback
import
collections
class
TranspilerTest
(
unittest
.
TestCase
):
...
...
@@ -53,9 +54,18 @@ class TranspilerTest(unittest.TestCase):
self
.
origin_prog
=
main
.
clone
()
return
main
def
get_trainer
(
self
,
config
=
None
,
sync_mode
=
True
):
t
=
self
.
_transpiler_instance
(
config
,
sync_mode
)
return
t
.
get_trainer_program
()
def
get_trainer
(
self
,
config
=
None
):
src
=
fluid
.
default_startup_program
().
clone
()
t
=
self
.
_transpiler_instance
(
config
)
trainer_main
=
t
.
get_trainer_program
()
trainer_startup
=
fluid
.
default_startup_program
()
assert
(
src
.
num_blocks
==
1
)
assert
(
trainer_startup
.
num_blocks
==
src
.
num_blocks
)
return
trainer_main
,
trainer_startup
def
get_pserver
(
self
,
ep
,
config
=
None
,
sync_mode
=
True
):
t
=
self
.
_transpiler_instance
(
config
,
sync_mode
)
...
...
@@ -91,7 +101,21 @@ class TestBasicModel(TranspilerTest):
pserver
,
startup
=
self
.
get_pserver
(
self
.
pserver1_ep
)
pserver2
,
startup2
=
self
.
get_pserver
(
self
.
pserver2_ep
)
trainer
=
self
.
get_trainer
()
trainer
,
trainer_startup
=
self
.
get_trainer
()
# splited var blocks should be in startup program
self
.
assertTrue
(
"fc_w.block0"
in
trainer_startup
.
global_block
().
vars
)
self
.
assertTrue
(
"fc_w.block1"
in
trainer_startup
.
global_block
().
vars
)
self
.
assertTrue
(
"fc_w"
in
trainer_startup
.
global_block
().
vars
)
self
.
assertTrue
(
"fc_b"
in
trainer_startup
.
global_block
().
vars
)
self
.
assertTrue
(
"fc_w@GRAD"
not
in
trainer_startup
.
global_block
().
vars
)
self
.
assertTrue
(
"fc_b@GRAD"
not
in
trainer_startup
.
global_block
().
vars
)
src
=
[
op
.
type
for
op
in
trainer_startup
.
global_block
().
ops
]
dst
=
[
'fill_constant'
,
'fill_constant'
,
'uniform_random'
,
'recv'
,
'recv'
,
\
'fetch_barrier'
,
'concat'
]
self
.
assertEqual
(
src
,
dst
)
self
.
assertEqual
([
op
.
type
for
op
in
trainer
.
global_block
().
ops
],
[
'mul'
,
'elementwise_add'
,
'elementwise_sub'
,
'square'
,
'mean'
,
...
...
@@ -142,7 +166,7 @@ class TestBasicModelWithLargeBlockSize(TranspilerTest):
pserver
,
startup
=
self
.
get_pserver
(
self
.
pserver1_ep
,
config
)
pserver2
,
startup2
=
self
.
get_pserver
(
self
.
pserver2_ep
,
config
)
trainer
=
self
.
get_trainer
(
config
)
trainer
,
_
=
self
.
get_trainer
(
config
)
self
.
assertEqual
([
op
.
type
for
op
in
trainer
.
global_block
().
ops
],
[
'mul'
,
'elementwise_add'
,
'elementwise_sub'
,
'square'
,
'mean'
,
...
...
@@ -226,7 +250,7 @@ class TestLRDecay(TranspilerTest):
def
transpiler_test_impl
(
self
):
pserver
,
startup
=
self
.
get_pserver
(
self
.
pserver1_ep
)
trainer
=
self
.
get_trainer
()
trainer
,
_
=
self
.
get_trainer
()
self
.
assertEqual
(
len
(
pserver
.
blocks
),
4
)
lr_decay_ops
=
[
op
.
type
for
op
in
pserver
.
blocks
[
1
].
ops
]
...
...
@@ -256,7 +280,7 @@ class TestLRDecayConditional(TranspilerTest):
def
transpiler_test_impl
(
self
):
pserver
,
startup
=
self
.
get_pserver
(
self
.
pserver1_ep
)
trainer
=
self
.
get_trainer
()
trainer
,
_
=
self
.
get_trainer
()
serv_op
=
pserver
.
blocks
[
0
].
ops
[
0
]
sub_blocks
=
[]
...
...
@@ -305,7 +329,7 @@ class TestL2Decay(TranspilerTest):
def
transpiler_test_impl
(
self
):
pserver
,
startup
=
self
.
get_pserver
(
self
.
pserver1_ep
)
trainer
=
self
.
get_trainer
()
trainer
,
_
=
self
.
get_trainer
()
self
.
assertEqual
(
len
(
pserver
.
blocks
),
3
)
self
.
assertEqual
([
op
.
type
for
op
in
pserver
.
blocks
[
1
].
ops
],
...
...
@@ -340,7 +364,7 @@ class TestL2DecayWithPiecewise(TranspilerTest):
def
transpiler_test_impl
(
self
):
pserver
,
startup
=
self
.
get_pserver
(
self
.
pserver1_ep
)
trainer
=
self
.
get_trainer
()
trainer
,
_
=
self
.
get_trainer
()
self
.
assertEqual
(
len
(
pserver
.
blocks
),
9
)
self
.
assertEqual
([
op
.
type
for
op
in
pserver
.
blocks
[
1
].
ops
],
[
...
...
@@ -415,7 +439,7 @@ class TestLocalLookupTable(TestDistLookupTableBase):
self
.
assertEqual
([
op
.
type
for
op
in
pserver1
.
blocks
[
2
].
ops
],
[
"sum"
,
"adam"
,
"scale"
,
"scale"
])
trainer
=
self
.
get_trainer
()
trainer
,
_
=
self
.
get_trainer
()
self
.
assertEqual
(
len
(
trainer
.
blocks
),
1
)
ops
=
[
'lookup_table'
,
'sequence_pool'
,
'lookup_table'
,
'sequence_pool'
,
...
...
@@ -453,7 +477,7 @@ class TestDistLookupTable(TestDistLookupTableBase):
# 5 save table
self
.
assertEqual
([
op
.
type
for
op
in
pserver1
.
blocks
[
5
].
ops
],
[
"save"
])
trainer
=
self
.
get_trainer
()
trainer
,
_
=
self
.
get_trainer
()
self
.
assertEqual
(
len
(
trainer
.
blocks
),
1
)
ops
=
[
'split_ids'
,
'prefetch'
,
'merge_ids'
,
'sequence_pool'
,
'split_ids'
,
...
...
@@ -486,7 +510,7 @@ class TestAsyncLocalLookupTable(TestDistLookupTableBase):
self
.
assertEqual
([
op
.
type
for
op
in
pserver1
.
blocks
[
2
].
ops
],
[
"adam"
,
"scale"
,
"scale"
])
trainer
=
self
.
get_trainer
(
config
)
trainer
,
_
=
self
.
get_trainer
(
config
)
self
.
assertEqual
(
len
(
trainer
.
blocks
),
1
)
ops
=
[
'lookup_table'
,
'sequence_pool'
,
'lookup_table'
,
'sequence_pool'
,
...
...
@@ -525,7 +549,7 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase):
# 5 save table
self
.
assertEqual
([
op
.
type
for
op
in
pserver1
.
blocks
[
5
].
ops
],
[
"save"
])
trainer
=
self
.
get_trainer
(
config
)
trainer
,
_
=
self
.
get_trainer
(
config
)
self
.
assertEqual
(
len
(
trainer
.
blocks
),
1
)
ops
=
[
'split_ids'
,
'prefetch'
,
'merge_ids'
,
'sequence_pool'
,
'split_ids'
,
...
...
python/paddle/fluid/tests/unittests/test_layers.py
浏览文件 @
a72f68f2
...
...
@@ -21,6 +21,7 @@ import paddle.fluid.nets as nets
from
paddle.fluid.framework
import
Program
,
program_guard
,
default_main_program
from
paddle.fluid.param_attr
import
ParamAttr
import
decorators
from
paddle.fluid.initializer
import
Constant
class
TestBook
(
unittest
.
TestCase
):
...
...
@@ -485,6 +486,20 @@ class TestBook(unittest.TestCase):
self
.
assertIsNotNone
(
out
)
print
(
str
(
program
))
def
test_prelu
(
self
):
program
=
Program
()
with
program_guard
(
program
):
input
=
layers
.
data
(
name
=
"input"
,
shape
=
[
5
,
200
,
100
,
100
],
dtype
=
"float32"
)
mode
=
'channel'
out
=
layers
.
prelu
(
input
,
mode
,
param_attr
=
ParamAttr
(
initializer
=
Constant
(
1.0
)),
name
=
'prelu'
)
self
.
assertIsNotNone
(
out
)
print
(
str
(
program
))
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_prelu_op.py
浏览文件 @
a72f68f2
...
...
@@ -20,30 +20,58 @@ from op_test import OpTest
class
PReluTest
(
OpTest
):
def
setUp
(
self
):
self
.
op_type
=
"prelu"
x_np
=
np
.
random
.
normal
(
size
=
(
10
,
10
)).
astype
(
"float32"
)
for
pos
,
val
in
np
.
ndenumerate
(
x_np
):
# Since zero point in prelu is not differentiable, avoid randomize
# zero.
while
abs
(
val
)
<
1e-3
:
x_np
[
pos
]
=
np
.
random
.
normal
()
val
=
x_np
[
pos
]
x_np_sign
=
np
.
sign
(
x_np
)
x_np
=
x_np_sign
*
np
.
maximum
(
x_np
,
.
005
)
alpha_np
=
np
.
array
([.
1
],
dtype
=
"float32"
)
self
.
inputs
=
{
'X'
:
x_np
,
'Alpha'
:
alpha_np
}
self
.
initTestCase
()
x_np
=
np
.
random
.
normal
(
size
=
(
3
,
5
,
5
,
10
)).
astype
(
"float32"
)
# Since zero point in prelu is not differentiable, avoid randomize
# zero.
x_np
[
np
.
abs
(
x_np
)
<
0.005
]
=
0.02
if
self
.
attrs
==
{
'mode'
:
"all"
}:
alpha_np
=
np
.
random
.
rand
(
1
).
astype
(
"float32"
)
self
.
inputs
=
{
'X'
:
x_np
,
'Alpha'
:
alpha_np
}
elif
self
.
attrs
==
{
'mode'
:
"channel"
}:
alpha_np
=
np
.
random
.
rand
(
1
,
x_np
.
shape
[
1
],
1
,
1
).
astype
(
"float32"
)
self
.
inputs
=
{
'X'
:
x_np
,
'Alpha'
:
alpha_np
}
else
:
alpha_np
=
np
.
random
.
rand
(
*
x_np
.
shape
).
astype
(
"float32"
)
self
.
inputs
=
{
'X'
:
x_np
,
'Alpha'
:
alpha_np
}
out_np
=
np
.
maximum
(
self
.
inputs
[
'X'
],
0.
)
out_np
=
out_np
+
np
.
minimum
(
self
.
inputs
[
'X'
],
0.
)
*
self
.
inputs
[
'Alpha'
]
assert
out_np
is
not
self
.
inputs
[
'X'
]
self
.
outputs
=
{
'Out'
:
out_np
}
def
initTestCase
(
self
):
self
.
attrs
=
{
'mode'
:
"channel"
}
def
test_check_output
(
self
):
self
.
check_output
()
def
test_check_grad
(
self
):
self
.
check_grad
([
'X'
],
'Out'
)
self
.
check_grad
([
'X'
,
'Alpha'
],
'Out'
)
def
test_check_grad_ignore_x
(
self
):
self
.
check_grad
([
'Alpha'
],
'Out'
,
no_grad_set
=
set
(
'X'
))
def
test_check_grad_ignore_alpha
(
self
):
self
.
check_grad
([
'X'
],
'Out'
,
no_grad_set
=
set
(
'Alpha'
))
class
TestCase1
(
PReluTest
):
def
initTestCase
(
self
):
self
.
attrs
=
{
'mode'
:
"all"
}
class
TestCase2
(
PReluTest
):
def
initTestCase
(
self
):
self
.
attrs
=
{
'mode'
:
"channel"
}
class
TestCase3
(
PReluTest
):
def
initTestCase
(
self
):
self
.
attrs
=
{
'mode'
:
"element"
}
if
__name__
==
"__main__"
:
...
...
python/paddle/fluid/transpiler/distribute_transpiler.py
浏览文件 @
a72f68f2
...
...
@@ -195,6 +195,9 @@ class DistributeTranspiler(object):
if
program
is
None
:
program
=
default_main_program
()
self
.
origin_program
=
program
self
.
origin_startup_program
=
default_startup_program
().
clone
()
self
.
startup_program
=
default_startup_program
()
self
.
trainer_num
=
trainers
self
.
sync_mode
=
sync_mode
self
.
trainer_id
=
trainer_id
...
...
@@ -205,10 +208,10 @@ class DistributeTranspiler(object):
ps_dispatcher
=
self
.
config
.
split_method
(
self
.
pserver_endpoints
)
self
.
has_distributed_lookup_table
=
self
.
_has_distributed_lookup_table
()
# split and create vars, then put splited vars in dicts for later use.
# s
tep 1: s
plit and create vars, then put splited vars in dicts for later use.
self
.
_init_splited_vars
()
# step
3.1
: insert send op to send gradient vars to parameter servers
# step
2
: insert send op to send gradient vars to parameter servers
ps_dispatcher
.
reset
()
send_vars
=
[]
...
...
@@ -265,7 +268,7 @@ class DistributeTranspiler(object):
RPC_OP_ROLE_ATTR_NAME
:
RPC_OP_ROLE_ATTR_VALUE
})
# step 3
.2
: insert recv op to receive parameters from parameter server
# step 3: insert recv op to receive parameters from parameter server
recv_vars
=
[]
for
_
,
var
in
enumerate
(
send_vars
):
recv_vars
.
append
(
self
.
grad_param_mapping
[
var
])
...
...
@@ -312,6 +315,8 @@ class DistributeTranspiler(object):
outputs
=
{
"Out"
:
[
orig_param
]},
attrs
=
{
"axis"
:
0
})
self
.
_get_trainer_startup_program
(
recv_vars
=
recv_vars
,
eplist
=
eplist
)
if
self
.
has_distributed_lookup_table
:
self
.
_replace_lookup_table_op_with_prefetch
(
program
,
pserver_endpoints
)
...
...
@@ -328,8 +333,78 @@ class DistributeTranspiler(object):
# FIXME(typhoonzero): Also ops like clip_gradient, lrn_decay?
delete_ops
(
self
.
origin_program
.
global_block
(),
self
.
optimize_ops
)
self
.
origin_program
.
__str__
()
return
self
.
origin_program
def
_get_trainer_startup_program
(
self
,
recv_vars
,
eplist
,
startup_program
=
None
):
"""
Get transpiled trainer side startup program.
Args:
startup_program(Program): Startup program.
Returns:
Program: trainer side startup program.
"""
if
startup_program
is
None
:
startup_program
=
self
.
startup_program
# FIXME(gongwb): delete not need ops.
# note that: some parameter is not trainable and those ops can't be deleted.
for
varname
,
splited_var
in
self
.
param_var_mapping
.
iteritems
():
# Get the eplist of recv vars
eps
=
[]
for
var
in
splited_var
:
index
=
[
v
.
name
for
v
in
recv_vars
].
index
(
var
.
name
)
eps
.
append
(
eplist
[
index
])
for
var
in
splited_var
:
if
startup_program
.
global_block
().
has_var
(
var
.
name
):
continue
startup_program
.
global_block
().
create_var
(
name
=
var
.
name
,
persistable
=
False
,
type
=
var
.
type
,
dtype
=
var
.
dtype
,
shape
=
var
.
shape
,
lod_level
=
var
.
lod_level
)
op
=
startup_program
.
global_block
().
append_op
(
type
=
"recv"
,
inputs
=
{},
outputs
=
{
"Out"
:
splited_var
},
attrs
=
{
"epmap"
:
eps
,
RPC_OP_ROLE_ATTR_NAME
:
RPC_OP_ROLE_ATTR_VALUE
})
startup_program
.
global_block
().
append_op
(
type
=
"fetch_barrier"
,
inputs
=
{},
outputs
=
{},
attrs
=
{
"endpoints"
:
self
.
pserver_endpoints
,
RPC_OP_ROLE_ATTR_NAME
:
RPC_OP_ROLE_ATTR_VALUE
})
for
varname
,
splited_var
in
self
.
param_var_mapping
.
iteritems
():
#add concat ops to merge splited parameters received from parameter servers.
if
len
(
splited_var
)
<=
1
:
continue
orig_param
=
startup_program
.
global_block
().
vars
[
varname
]
startup_program
.
global_block
().
append_op
(
type
=
"concat"
,
inputs
=
{
"X"
:
splited_var
},
outputs
=
{
"Out"
:
[
orig_param
]},
attrs
=
{
"axis"
:
0
})
return
startup_program
def
get_pserver_program
(
self
,
endpoint
):
"""
Get parameter server side program.
...
...
@@ -576,14 +651,16 @@ class DistributeTranspiler(object):
new_outputs
=
dict
()
# do not append startup op if var is not on this pserver
op_on_pserver
=
False
for
key
in
op
.
output_names
:
newname
,
_
=
_get_splited_name_and_shape
(
op
.
output
(
key
)[
0
])
if
newname
:
op_on_pserver
=
True
new_outputs
[
key
]
=
created_var_map
[
newname
]
elif
op
.
output
(
key
)[
0
]
in
pserver_vars
:
op_on_pserver
=
True
new_outputs
[
key
]
=
pserver_vars
[
op
.
output
(
key
)[
0
]]
# TODO(gongwb): remove this line.
if
op
.
type
not
in
[
"recv"
,
"fetch_barrier"
,
"concat"
]:
for
key
in
op
.
output_names
:
newname
,
_
=
_get_splited_name_and_shape
(
op
.
output
(
key
)[
0
])
if
newname
:
op_on_pserver
=
True
new_outputs
[
key
]
=
created_var_map
[
newname
]
elif
op
.
output
(
key
)[
0
]
in
pserver_vars
:
op_on_pserver
=
True
new_outputs
[
key
]
=
pserver_vars
[
op
.
output
(
key
)[
0
]]
if
op_on_pserver
:
# most startup program ops have no inputs
...
...
@@ -1022,7 +1099,6 @@ class DistributeTranspiler(object):
var_mapping
[
varname
]
=
\
[
program
.
global_block
().
var
(
orig_var
.
name
)]
continue
var_mapping
[
varname
]
=
[]
orig_shape
=
orig_var
.
shape
orig_dim1_flatten
=
1
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录