Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
e398348e
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
e398348e
编写于
8月 16, 2018
作者:
M
minqiyang
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'develop' of
https://github.com/PaddlePaddle/Paddle
into port_pybind11
上级
99d3f089
340a104c
变更
71
隐藏空白更改
内联
并排
Showing
71 changed file
with
1344 addition
and
393 deletion
+1344
-393
.gitignore
.gitignore
+2
-0
CMakeLists.txt
CMakeLists.txt
+2
-1
cmake/configure.cmake
cmake/configure.cmake
+8
-0
cmake/external/anakin.cmake
cmake/external/anakin.cmake
+2
-3
paddle/fluid/API.spec
paddle/fluid/API.spec
+3
-2
paddle/fluid/framework/attribute.h
paddle/fluid/framework/attribute.h
+4
-3
paddle/fluid/framework/threadpool.cc
paddle/fluid/framework/threadpool.cc
+7
-0
paddle/fluid/inference/api/CMakeLists.txt
paddle/fluid/inference/api/CMakeLists.txt
+1
-1
paddle/fluid/operators/CMakeLists.txt
paddle/fluid/operators/CMakeLists.txt
+3
-6
paddle/fluid/operators/activation_op.cu
paddle/fluid/operators/activation_op.cu
+3
-1
paddle/fluid/operators/activation_op.h
paddle/fluid/operators/activation_op.h
+6
-6
paddle/fluid/operators/assign_value_op.cu.cc
paddle/fluid/operators/assign_value_op.cu.cc
+4
-1
paddle/fluid/operators/conv_cudnn_op.cu.cc
paddle/fluid/operators/conv_cudnn_op.cu.cc
+36
-20
paddle/fluid/operators/cross_entropy_op.cu
paddle/fluid/operators/cross_entropy_op.cu
+8
-4
paddle/fluid/operators/distributed/variable_response.cc
paddle/fluid/operators/distributed/variable_response.cc
+6
-1
paddle/fluid/operators/elementwise_add_op.cu
paddle/fluid/operators/elementwise_add_op.cu
+2
-1
paddle/fluid/operators/elementwise_div_op.cu
paddle/fluid/operators/elementwise_div_op.cu
+7
-2
paddle/fluid/operators/elementwise_mul_op.cu
paddle/fluid/operators/elementwise_mul_op.cu
+7
-1
paddle/fluid/operators/elementwise_op_function.h
paddle/fluid/operators/elementwise_op_function.h
+2
-2
paddle/fluid/operators/elementwise_sub_op.cu
paddle/fluid/operators/elementwise_sub_op.cu
+7
-1
paddle/fluid/operators/fc_mkldnn_op.cc
paddle/fluid/operators/fc_mkldnn_op.cc
+7
-2
paddle/fluid/operators/fc_op.cc
paddle/fluid/operators/fc_op.cc
+80
-16
paddle/fluid/operators/fill_constant_op.cc
paddle/fluid/operators/fill_constant_op.cc
+19
-34
paddle/fluid/operators/fill_constant_op.cu.cc
paddle/fluid/operators/fill_constant_op.cu.cc
+26
-0
paddle/fluid/operators/fill_constant_op.h
paddle/fluid/operators/fill_constant_op.h
+48
-0
paddle/fluid/operators/fill_op.cc
paddle/fluid/operators/fill_op.cc
+1
-1
paddle/fluid/operators/gaussian_random_op.cu
paddle/fluid/operators/gaussian_random_op.cu
+2
-0
paddle/fluid/operators/gru_op.cc
paddle/fluid/operators/gru_op.cc
+159
-3
paddle/fluid/operators/gru_op.cu.cc
paddle/fluid/operators/gru_op.cu.cc
+90
-0
paddle/fluid/operators/gru_op.h
paddle/fluid/operators/gru_op.h
+0
-84
paddle/fluid/operators/listen_and_serv_op.cc
paddle/fluid/operators/listen_and_serv_op.cc
+4
-1
paddle/fluid/operators/math/blas.h
paddle/fluid/operators/math/blas.h
+41
-0
paddle/fluid/operators/math/blas_impl.h
paddle/fluid/operators/math/blas_impl.h
+75
-0
paddle/fluid/operators/math/cross_entropy.cu
paddle/fluid/operators/math/cross_entropy.cu
+18
-2
paddle/fluid/operators/math/cross_entropy.h
paddle/fluid/operators/math/cross_entropy.h
+17
-0
paddle/fluid/operators/math/selected_rows_functor.cu
paddle/fluid/operators/math/selected_rows_functor.cu
+11
-2
paddle/fluid/operators/math/softmax.cu
paddle/fluid/operators/math/softmax.cu
+3
-0
paddle/fluid/operators/mean_op.cu
paddle/fluid/operators/mean_op.cu
+6
-4
paddle/fluid/operators/mean_op.h
paddle/fluid/operators/mean_op.h
+1
-1
paddle/fluid/operators/mul_op.cu.cc
paddle/fluid/operators/mul_op.cu.cc
+4
-3
paddle/fluid/operators/pool_cudnn_op.cu.cc
paddle/fluid/operators/pool_cudnn_op.cu.cc
+4
-2
paddle/fluid/operators/prelu_op.cc
paddle/fluid/operators/prelu_op.cc
+52
-13
paddle/fluid/operators/prelu_op.cu
paddle/fluid/operators/prelu_op.cu
+0
-22
paddle/fluid/operators/prelu_op.h
paddle/fluid/operators/prelu_op.h
+73
-52
paddle/fluid/operators/scale_op.cu
paddle/fluid/operators/scale_op.cu
+5
-1
paddle/fluid/operators/scatter_op.h
paddle/fluid/operators/scatter_op.h
+2
-2
paddle/fluid/operators/softmax_cudnn_op.cu.cc
paddle/fluid/operators/softmax_cudnn_op.cu.cc
+2
-1
paddle/fluid/operators/softmax_op.cu.cc
paddle/fluid/operators/softmax_op.cu.cc
+2
-1
paddle/fluid/operators/sum_op.cu
paddle/fluid/operators/sum_op.cu
+4
-1
paddle/fluid/operators/sum_op.h
paddle/fluid/operators/sum_op.h
+1
-1
paddle/fluid/operators/top_k_op.cu
paddle/fluid/operators/top_k_op.cu
+22
-6
paddle/fluid/operators/uniform_random_op.cu
paddle/fluid/operators/uniform_random_op.cu
+50
-9
paddle/fluid/platform/dynload/mklml.h
paddle/fluid/platform/dynload/mklml.h
+8
-0
paddle/fluid/platform/enforce.h
paddle/fluid/platform/enforce.h
+6
-4
paddle/fluid/platform/enforce_test.cc
paddle/fluid/platform/enforce_test.cc
+19
-11
paddle/fluid/platform/gpu_info.cc
paddle/fluid/platform/gpu_info.cc
+2
-1
python/CMakeLists.txt
python/CMakeLists.txt
+3
-2
python/paddle/fluid/__init__.py
python/paddle/fluid/__init__.py
+1
-1
python/paddle/fluid/initializer.py
python/paddle/fluid/initializer.py
+0
-1
python/paddle/fluid/layers/nn.py
python/paddle/fluid/layers/nn.py
+60
-2
python/paddle/fluid/tests/unittests/CMakeLists.txt
python/paddle/fluid/tests/unittests/CMakeLists.txt
+2
-2
python/paddle/fluid/tests/unittests/test_dist_train.py
python/paddle/fluid/tests/unittests/test_dist_train.py
+17
-0
python/paddle/fluid/tests/unittests/test_dist_transpiler.py
python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+37
-13
python/paddle/fluid/tests/unittests/test_fc_mkldnn_op.py
python/paddle/fluid/tests/unittests/test_fc_mkldnn_op.py
+2
-7
python/paddle/fluid/tests/unittests/test_fc_op.py
python/paddle/fluid/tests/unittests/test_fc_op.py
+90
-0
python/paddle/fluid/tests/unittests/test_layers.py
python/paddle/fluid/tests/unittests/test_layers.py
+15
-0
python/paddle/fluid/tests/unittests/test_prelu_op.py
python/paddle/fluid/tests/unittests/test_prelu_op.py
+42
-14
python/paddle/fluid/transpiler/distribute_transpiler.py
python/paddle/fluid/transpiler/distribute_transpiler.py
+88
-12
tools/manylinux1/Dockerfile.x64
tools/manylinux1/Dockerfile.x64
+1
-1
tools/manylinux1/build_scripts/build.sh
tools/manylinux1/build_scripts/build.sh
+1
-1
tools/manylinux1/build_scripts/install_nccl2.sh
tools/manylinux1/build_scripts/install_nccl2.sh
+1
-1
未找到文件。
.gitignore
浏览文件 @
e398348e
...
...
@@ -5,6 +5,7 @@ python/paddle/v2/fluid/tests/book/image_classification_resnet.inference.model/
python/paddle/v2/fluid/tests/book/image_classification_vgg.inference.model/
python/paddle/v2/fluid/tests/book/label_semantic_roles.inference.model/
*.DS_Store
*.vs
build/
build_doc/
*.user
...
...
@@ -15,6 +16,7 @@ build_doc/
.cproject
.pydevproject
.settings/
CMakeSettings.json
Makefile
.test_env/
third_party/
...
...
CMakeLists.txt
浏览文件 @
e398348e
...
...
@@ -204,11 +204,12 @@ include(external/snappy) # download snappy
include
(
external/snappystream
)
include
(
external/threadpool
)
set
(
WITH_ANAKIN OFF CACHE STRING
"Disable Anakin first, will add it later."
FORCE
)
if
(
WITH_GPU
)
include
(
cuda
)
include
(
tensorrt
)
include
(
external/anakin
)
elseif
()
set
(
WITH_ANAKIN OFF CACHE STRING
"Anakin is used in GPU only now."
FORCE
)
endif
()
include
(
cudnn
)
# set cudnn libraries, must before configure
...
...
cmake/configure.cmake
浏览文件 @
e398348e
...
...
@@ -56,6 +56,10 @@ if(NOT CMAKE_CROSSCOMPILING)
set
(
SIMD_FLAG
${
SSE3_FLAG
}
)
endif
()
endif
()
if
(
UNIX AND NOT APPLE
)
# except apple from nix*Os family
set
(
LINUX TRUE
)
endif
(
UNIX AND NOT APPLE
)
if
(
NOT WITH_GOLANG
)
add_definitions
(
-DPADDLE_WITHOUT_GOLANG
)
...
...
@@ -104,6 +108,10 @@ if(WITH_GPU)
if
(
${
CUDNN_MAJOR_VERSION
}
VERSION_LESS 7
)
message
(
FATAL_ERROR
"Anakin needs CUDNN >= 7.0 to compile"
)
endif
()
set
(
ENV{CUDNN_INCLUDE_DIR}
${
CUDNN_INCLUDE_DIR
}
)
set
(
ENV{CUDNN_LIBRARY}
${
CUDNN_LIBRARY
}
)
message
(
STATUS
"cudnn include header is
${
CUDNN_INCLUDE_DIR
}
/cudnn.h"
)
message
(
STATUS
"cudnn library is
${
CUDNN_LIBRARY
}
"
)
endif
()
elseif
(
WITH_AMD_GPU
)
add_definitions
(
-DPADDLE_WITH_HIP
)
...
...
cmake/external/anakin.cmake
浏览文件 @
e398348e
...
...
@@ -35,9 +35,8 @@ set(ANAKIN_COMPILE_EXTRA_FLAGS
ExternalProject_Add
(
extern_anakin
${
EXTERNAL_PROJECT_LOG_ARGS
}
# TODO(luotao): use PaddlePaddle/Anakin later
GIT_REPOSITORY
"https://github.com/luotao1/Anakin"
GIT_TAG
"3957ae9263eaa0b1986758dac60a88852afb09be"
GIT_REPOSITORY
"https://github.com/PaddlePaddle/Anakin"
GIT_TAG
"04256ba78fa3da0beb74e8036c8efd68c12824d6"
PREFIX
${
ANAKIN_SOURCE_DIR
}
UPDATE_COMMAND
""
CMAKE_ARGS -DUSE_GPU_PLACE=YES
...
...
paddle/fluid/API.spec
浏览文件 @
e398348e
...
...
@@ -155,10 +155,11 @@ paddle.fluid.layers.resize_bilinear ArgSpec(args=['input', 'out_shape', 'scale',
paddle.fluid.layers.gather ArgSpec(args=['input', 'index'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.random_crop ArgSpec(args=['x', 'shape', 'seed'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.mean_iou ArgSpec(args=['input', 'label', 'num_classes'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.relu ArgSpec(args=['x'
], varargs=None, keywords=None, defaults=None
)
paddle.fluid.layers.log ArgSpec(args=['x'
], varargs=None, keywords=None, defaults=None
)
paddle.fluid.layers.relu ArgSpec(args=['x'
, 'name'], varargs=None, keywords=None, defaults=(None,)
)
paddle.fluid.layers.log ArgSpec(args=['x'
, 'name'], varargs=None, keywords=None, defaults=(None,)
)
paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.prelu ArgSpec(args=['x', 'mode', 'param_attr', 'name'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.flatten ArgSpec(args=['x', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None))
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True))
...
...
paddle/fluid/framework/attribute.h
浏览文件 @
e398348e
...
...
@@ -128,7 +128,8 @@ struct ExtractAttribute {
attr_value
=
&
boost
::
get
<
T
>
(
attr
);
}
catch
(
boost
::
bad_get
&
bad_get
)
{
PADDLE_THROW
(
"Cannot get attribute %s by type %s, its type is %s"
,
attr_name_
,
typeid
(
T
).
name
(),
attr
.
type
().
name
());
attr_name_
,
paddle
::
platform
::
demangle
(
typeid
(
T
).
name
()),
paddle
::
platform
::
demangle
(
attr
.
type
().
name
()));
}
return
attr_value
;
}
...
...
@@ -160,7 +161,7 @@ struct ExtractAttribute<bool> {
attr_value
=
&
boost
::
get
<
bool
>
(
attr
);
}
catch
(
boost
::
bad_get
&
bad_get
)
{
PADDLE_THROW
(
"Cannot get attribute %s by type bool, its type is %s"
,
attr_name_
,
attr
.
type
().
name
(
));
attr_name_
,
paddle
::
platform
::
demangle
(
attr
.
type
().
name
()
));
}
return
attr_value
;
}
...
...
@@ -186,7 +187,7 @@ struct ExtractAttribute<int64_t> {
attr_value
=
&
boost
::
get
<
int64_t
>
(
attr
);
}
catch
(
boost
::
bad_get
&
bad_get
)
{
PADDLE_THROW
(
"Cannot get attribute %s by type int64_t, its type is %s"
,
attr_name_
,
attr
.
type
().
name
(
));
attr_name_
,
paddle
::
platform
::
demangle
(
attr
.
type
().
name
()
));
}
return
attr_value
;
}
...
...
paddle/fluid/framework/threadpool.cc
浏览文件 @
e398348e
...
...
@@ -20,6 +20,9 @@
DEFINE_int32
(
io_threadpool_size
,
100
,
"number of threads used for doing IO, default 100"
);
DEFINE_int32
(
dist_threadpool_size
,
0
,
"number of threads used for distributed executed."
);
namespace
paddle
{
namespace
framework
{
...
...
@@ -35,6 +38,10 @@ void ThreadPool::Init() {
if
(
threadpool_
.
get
()
==
nullptr
)
{
// TODO(Yancey1989): specify the max threads number
int
num_threads
=
std
::
thread
::
hardware_concurrency
();
if
(
FLAGS_dist_threadpool_size
>
0
)
{
num_threads
=
FLAGS_dist_threadpool_size
;
VLOG
(
1
)
<<
"set dist_threadpool_size to "
<<
num_threads
;
}
PADDLE_ENFORCE_GT
(
num_threads
,
0
);
threadpool_
.
reset
(
new
ThreadPool
(
num_threads
));
}
...
...
paddle/fluid/inference/api/CMakeLists.txt
浏览文件 @
e398348e
...
...
@@ -60,7 +60,7 @@ cc_library(paddle_inference_tensorrt_subgraph_engine
inference_api_test
(
test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_engine_tester.cc ARGS test_word2vec
)
endif
()
if
(
WITH_ANAKIN
)
# only needed in CI
if
(
WITH_ANAKIN
AND WITH_GPU
)
# only needed in CI
# compile the libinference_anakin_api.a and anakin.so.
nv_library
(
inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber
)
#nv_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin)
...
...
paddle/fluid/operators/CMakeLists.txt
浏览文件 @
e398348e
...
...
@@ -170,6 +170,9 @@ function(op_library TARGET)
file
(
APPEND
${
pybind_file
}
"USE_OP(fake_dequantize_max_abs);
\n
"
)
elseif
(
${
TARGET
}
STREQUAL
"tensorrt_engine_op"
)
message
(
STATUS
"Pybind skips [tensorrt_engine_op], for this OP is only used in inference"
)
elseif
(
${
TARGET
}
STREQUAL
"fc"
)
# HACK: fc only have mkldnn and cpu, which would mismatch the cpu only condition
file
(
APPEND
${
pybind_file
}
"USE_CPU_ONLY_OP(
${
TARGET
}
);
\n
"
)
else
()
file
(
APPEND
${
pybind_file
}
"USE_OP(
${
TARGET
}
);
\n
"
)
endif
()
...
...
@@ -300,12 +303,6 @@ op_library(channel_recv_op DEPS concurrency)
list
(
REMOVE_ITEM GENERAL_OPS
${
DEPS_OPS
}
)
# The fully connected layer is deleted when the WITH_MKLDNN flag is OFF
# Because the fully connected layer has only one MKLDNN's operator
if
(
NOT WITH_MKLDNN
)
list
(
REMOVE_ITEM GENERAL_OPS fc_op
)
endif
(
NOT WITH_MKLDNN
)
foreach
(
src
${
GENERAL_OPS
}
)
op_library
(
${
src
}
)
endforeach
()
...
...
paddle/fluid/operators/activation_op.cu
浏览文件 @
e398348e
...
...
@@ -26,6 +26,8 @@ namespace plat = paddle::platform;
act_type##_grad, ops::ActivationGradKernel<plat::CUDADeviceContext, \
ops::grad_functor<float>>, \
ops::ActivationGradKernel<plat::CUDADeviceContext, \
ops::grad_functor<double>>);
ops::grad_functor<double>>, \
ops::ActivationGradKernel<plat::CUDADeviceContext, \
ops::grad_functor<plat::float16>>);
FOR_EACH_KERNEL_FUNCTOR
(
REGISTER_ACTIVATION_CUDA_KERNEL
);
paddle/fluid/operators/activation_op.h
浏览文件 @
e398348e
...
...
@@ -333,8 +333,7 @@ struct SqrtGradFunctor : public BaseActivationFunctor<T> {
template
<
typename
Device
,
typename
X
,
typename
Out
,
typename
dOut
,
typename
dX
>
void
operator
()(
Device
d
,
X
x
,
Out
out
,
dOut
dout
,
dX
dx
)
const
{
const
Out
out_conj
=
Eigen
::
numext
::
conj
(
out
);
dx
.
device
(
d
)
=
static_cast
<
T
>
(
0.5
)
*
dout
/
out_conj
;
dx
.
device
(
d
)
=
static_cast
<
T
>
(
0.5
)
*
dout
/
out
;
}
};
...
...
@@ -740,7 +739,7 @@ struct PowGradFunctor : public BaseActivationFunctor<T> {
typename
dX
>
void
operator
()(
Device
d
,
X
x
,
Out
out
,
dOut
dout
,
dX
dx
)
const
{
dx
.
device
(
d
)
=
dout
*
static_cast
<
T
>
(
factor
)
*
x
.
pow
(
static_cast
<
T
>
(
factor
-
static_cast
<
T
>
(
1
)
));
x
.
pow
(
static_cast
<
T
>
(
factor
)
-
static_cast
<
T
>
(
1
));
}
};
...
...
@@ -863,10 +862,11 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
template
<
typename
Device
,
typename
X
,
typename
Out
,
typename
dOut
,
typename
dX
>
void
operator
()(
Device
d
,
X
x
,
Out
out
,
dOut
dout
,
dX
dx
)
const
{
T
b
=
static_cast
<
T
>
(
beta
);
auto
temp1
=
static_cast
<
T
>
(
1
)
/
(
static_cast
<
T
>
(
1
)
+
(
static_cast
<
T
>
(
-
b
eta
)
*
x
).
exp
());
auto
temp2
=
temp1
*
(
static_cast
<
T
>
(
1
)
-
(
b
eta
*
out
));
dx
.
device
(
d
)
=
dout
*
((
b
eta
*
out
)
+
temp2
);
(
static_cast
<
T
>
(
1
)
+
(
static_cast
<
T
>
(
-
b
)
*
x
).
exp
());
auto
temp2
=
temp1
*
(
static_cast
<
T
>
(
1
)
-
(
b
*
out
));
dx
.
device
(
d
)
=
dout
*
((
b
*
out
)
+
temp2
);
}
};
...
...
paddle/fluid/operators/assign_value_op.cu.cc
浏览文件 @
e398348e
...
...
@@ -13,7 +13,10 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/assign_value_op.h"
#include "paddle/fluid/platform/float16.h"
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_CUDA_KERNEL
(
assign_value
,
ops
::
AssignValueKernel
<
int
>
,
ops
::
AssignValueKernel
<
float
>
);
ops
::
AssignValueKernel
<
float
>
,
ops
::
AssignValueKernel
<
plat
::
float16
>
);
paddle/fluid/operators/conv_cudnn_op.cu.cc
浏览文件 @
e398348e
...
...
@@ -39,6 +39,27 @@ using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
static
constexpr
size_t
kCONV_CUDNN_WORKSPACE_LIMIT_BYTES
=
static_cast
<
size_t
>
(
1024
)
*
1024
*
1024
;
template
<
typename
T
,
typename
DeviceContext
>
// bool EnableFp16(const T& dummy, const DeviceContext& dev_ctx,
bool
EnableFp16
(
const
DeviceContext
&
dev_ctx
,
cudnnConvolutionDescriptor_t
cudnn_conv_desc
)
{
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
// Tensor core is supported since the volta GPU and
// is only enabled when input and filter data are float16
if
(
dev_ctx
.
GetComputeCapability
()
>=
70
&&
std
::
type_index
(
typeid
(
T
))
==
std
::
type_index
(
typeid
(
platform
::
float16
)))
{
PADDLE_ENFORCE
(
platform
::
dynload
::
cudnnSetConvolutionMathType
(
cudnn_conv_desc
,
CUDNN_TENSOR_OP_MATH
));
return
true
;
}
else
{
PADDLE_ENFORCE
(
platform
::
dynload
::
cudnnSetConvolutionMathType
(
cudnn_conv_desc
,
CUDNN_DEFAULT_MATH
));
}
#endif
return
false
;
}
template
<
typename
T
>
class
CUDNNConvOpKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
...
...
@@ -128,27 +149,14 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
cudnnConvolutionFwdAlgo_t
algo
;
auto
&
dev_ctx
=
ctx
.
template
device_context
<
platform
::
CUDADeviceContext
>();
auto
handle
=
dev_ctx
.
cudnn_handle
();
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionForwardAlgorithm
(
handle
,
cudnn_input_desc
,
cudnn_filter_desc
,
cudnn_conv_desc
,
cudnn_output_desc
,
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT
,
workspace_size_limit
,
&
algo
));
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
// Tensor core is supported since the volta GPU and
// is only enabled when input and filter data are float16
if
(
dev_ctx
.
GetComputeCapability
()
>=
70
&&
std
::
type_index
(
typeid
(
T
))
==
std
::
type_index
(
typeid
(
platform
::
float16
)))
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnSetConvolutionMathType
(
cudnn_conv_desc
,
CUDNN_TENSOR_OP_MATH
));
// Currently tensor core is only enabled using this algo
if
(
EnableFp16
<
T
>
(
dev_ctx
,
cudnn_conv_desc
))
{
algo
=
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
;
}
else
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnSetConvolutionMathType
(
cudnn_conv_desc
,
CUDNN_DEFAULT_MATH
));
PADDLE_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionForwardAlgorithm
(
handle
,
cudnn_input_desc
,
cudnn_filter_desc
,
cudnn_conv_desc
,
cudnn_output_desc
,
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT
,
workspace_size_limit
,
&
algo
));
}
#endif
// get workspace size able to allocate
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionForwardWorkspaceSize
(
...
...
@@ -288,6 +296,9 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
}
else
{
data_algo
=
CUDNN_CONVOLUTION_BWD_DATA_ALGO_1
;
}
if
(
EnableFp16
<
T
>
(
dev_ctx
,
cudnn_conv_desc
))
{
data_algo
=
CUDNN_CONVOLUTION_BWD_DATA_ALGO_1
;
}
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionBackwardDataWorkspaceSize
(
...
...
@@ -307,6 +318,9 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
}
else
{
filter_algo
=
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1
;
}
if
(
EnableFp16
<
T
>
(
dev_ctx
,
cudnn_conv_desc
))
{
filter_algo
=
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1
;
}
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionBackwardFilterWorkspaceSize
(
...
...
@@ -362,7 +376,8 @@ REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace,
paddle
::
operators
::
CUDNNConvOpKernel
<
plat
::
float16
>
);
REGISTER_OP_KERNEL
(
conv2d_grad
,
CUDNN
,
plat
::
CUDAPlace
,
paddle
::
operators
::
CUDNNConvGradOpKernel
<
float
>
,
paddle
::
operators
::
CUDNNConvGradOpKernel
<
double
>
);
paddle
::
operators
::
CUDNNConvGradOpKernel
<
double
>
,
paddle
::
operators
::
CUDNNConvGradOpKernel
<
plat
::
float16
>
);
REGISTER_OP_KERNEL
(
conv3d
,
CUDNN
,
plat
::
CUDAPlace
,
paddle
::
operators
::
CUDNNConvOpKernel
<
float
>
,
...
...
@@ -370,4 +385,5 @@ REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace,
paddle
::
operators
::
CUDNNConvOpKernel
<
plat
::
float16
>
);
REGISTER_OP_KERNEL
(
conv3d_grad
,
CUDNN
,
plat
::
CUDAPlace
,
paddle
::
operators
::
CUDNNConvGradOpKernel
<
float
>
,
paddle
::
operators
::
CUDNNConvGradOpKernel
<
double
>
);
paddle
::
operators
::
CUDNNConvGradOpKernel
<
double
>
,
paddle
::
operators
::
CUDNNConvGradOpKernel
<
plat
::
float16
>
)
paddle/fluid/operators/cross_entropy_op.cu
浏览文件 @
e398348e
...
...
@@ -13,12 +13,16 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/cross_entropy_op.h"
#include "paddle/fluid/platform/float16.h"
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
using
CUDACtx
=
paddle
::
platform
::
CUDADeviceContext
;
REGISTER_OP_CUDA_KERNEL
(
cross_entropy
,
ops
::
CrossEntropyOpKernel
<
CUDACtx
,
float
>
,
ops
::
CrossEntropyOpKernel
<
CUDACtx
,
double
>
);
REGISTER_OP_CUDA_KERNEL
(
cross_entropy_grad
,
ops
::
CrossEntropyGradientOpKernel
<
CUDACtx
,
float
>
,
ops
::
CrossEntropyGradientOpKernel
<
CUDACtx
,
double
>
);
ops
::
CrossEntropyOpKernel
<
CUDACtx
,
double
>
,
ops
::
CrossEntropyOpKernel
<
CUDACtx
,
plat
::
float16
>
);
REGISTER_OP_CUDA_KERNEL
(
cross_entropy_grad
,
ops
::
CrossEntropyGradientOpKernel
<
CUDACtx
,
float
>
,
ops
::
CrossEntropyGradientOpKernel
<
CUDACtx
,
double
>
,
ops
::
CrossEntropyGradientOpKernel
<
CUDACtx
,
plat
::
float16
>
);
paddle/fluid/operators/distributed/variable_response.cc
浏览文件 @
e398348e
...
...
@@ -190,12 +190,15 @@ bool VariableResponse::ProcSerializedField(
#endif
}
VLOG
(
7
)
<<
"ProcSerializedField:"
<<
meta_
.
varname
()
<<
", type:"
<<
meta_
.
type
()
<<
std
::
endl
;
framework
::
DDim
dims
=
GetDims
(
meta_
.
dims
());
if
(
meta_
.
type
()
==
sendrecv
::
LOD_TENSOR
)
{
PADDLE_ENFORCE
(
meta_
.
lod_size
()
>=
0
,
"lod info should be got first!"
);
if
(
!
CopyLodTensorData
(
input
,
*
dev_ctx_
,
dims
,
num_bytes
))
{
return
false
;
}
return
true
;
}
...
...
@@ -206,7 +209,9 @@ bool VariableResponse::ProcSerializedField(
return
true
;
}
return
true
;
PADDLE_ENFORCE
(
"not supported var types:"
,
meta_
.
varname
(),
meta_
.
type
());
return
false
;
}
};
// namespace distributed
...
...
paddle/fluid/operators/elementwise_add_op.cu
浏览文件 @
e398348e
...
...
@@ -30,4 +30,5 @@ REGISTER_OP_CUDA_KERNEL(
ops
::
ElementwiseAddGradKernel
<
plat
::
CUDADeviceContext
,
float
>
,
ops
::
ElementwiseAddGradKernel
<
plat
::
CUDADeviceContext
,
double
>
,
ops
::
ElementwiseAddGradKernel
<
plat
::
CUDADeviceContext
,
int
>
,
ops
::
ElementwiseAddGradKernel
<
plat
::
CUDADeviceContext
,
int64_t
>
);
ops
::
ElementwiseAddGradKernel
<
plat
::
CUDADeviceContext
,
int64_t
>
,
ops
::
ElementwiseAddGradKernel
<
plat
::
CUDADeviceContext
,
plat
::
float16
>
);
paddle/fluid/operators/elementwise_div_op.cu
浏览文件 @
e398348e
...
...
@@ -14,19 +14,24 @@ limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/fluid/operators/elementwise_div_op.h"
#include "paddle/fluid/platform/float16.h"
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_CUDA_KERNEL
(
elementwise_div
,
ops
::
ElementwiseDivKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
ElementwiseDivKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
ops
::
ElementwiseDivKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int
>
,
ops
::
ElementwiseDivKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
);
ops
::
ElementwiseDivKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
,
ops
::
ElementwiseDivKernel
<
paddle
::
platform
::
CUDADeviceContext
,
plat
::
float16
>
);
REGISTER_OP_CUDA_KERNEL
(
elementwise_div_grad
,
ops
::
ElementwiseDivGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
ElementwiseDivGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
ops
::
ElementwiseDivGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int
>
,
ops
::
ElementwiseDivGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
,
ops
::
ElementwiseDivGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
);
plat
::
float16
>
);
paddle/fluid/operators/elementwise_mul_op.cu
浏览文件 @
e398348e
...
...
@@ -14,19 +14,25 @@ limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/fluid/operators/elementwise_mul_op.h"
#include "paddle/fluid/platform/float16.h"
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_CUDA_KERNEL
(
elementwise_mul
,
ops
::
ElementwiseMulKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
ElementwiseMulKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
ops
::
ElementwiseMulKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int
>
,
ops
::
ElementwiseMulKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
);
ops
::
ElementwiseMulKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
,
ops
::
ElementwiseMulKernel
<
paddle
::
platform
::
CUDADeviceContext
,
plat
::
float16
>
);
REGISTER_OP_CUDA_KERNEL
(
elementwise_mul_grad
,
ops
::
ElementwiseMulGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
ElementwiseMulGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
ops
::
ElementwiseMulGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int
>
,
ops
::
ElementwiseMulGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
plat
::
float16
>
,
ops
::
ElementwiseMulGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
);
paddle/fluid/operators/elementwise_op_function.h
浏览文件 @
e398348e
...
...
@@ -350,7 +350,7 @@ static __global__ void ElemwiseGradBroadcast1CUDAKernel(
int
j
=
blockIdx
.
x
;
int
i
=
threadIdx
.
x
;
int
tid
=
threadIdx
.
x
;
T
val
=
0
;
T
val
(
0
)
;
do
{
int
x_offset
=
i
*
w
+
j
;
...
...
@@ -418,7 +418,7 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel(
int
tid
=
threadIdx
.
x
;
int
j
=
blockIdx
.
x
;
T
val
=
0
;
T
val
(
0
)
;
int
ttid
=
tid
;
while
(
true
)
{
...
...
paddle/fluid/operators/elementwise_sub_op.cu
浏览文件 @
e398348e
...
...
@@ -14,19 +14,25 @@ limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/fluid/operators/elementwise_sub_op.h"
#include "paddle/fluid/platform/float16.h"
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_CUDA_KERNEL
(
elementwise_sub
,
ops
::
ElementwiseSubKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
ElementwiseSubKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
ops
::
ElementwiseSubKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int
>
,
ops
::
ElementwiseSubKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
);
ops
::
ElementwiseSubKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
,
ops
::
ElementwiseSubKernel
<
paddle
::
platform
::
CUDADeviceContext
,
plat
::
float16
>
);
REGISTER_OP_CUDA_KERNEL
(
elementwise_sub_grad
,
ops
::
ElementwiseSubGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
ElementwiseSubGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
ops
::
ElementwiseSubGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int
>
,
ops
::
ElementwiseSubGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
plat
::
float16
>
,
ops
::
ElementwiseSubGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
);
paddle/fluid/operators/fc_mkldnn_op.cc
浏览文件 @
e398348e
...
...
@@ -125,13 +125,16 @@ class FCMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto
input
=
ctx
.
Input
<
Tensor
>
(
"Input"
);
auto
w
=
ctx
.
Input
<
Tensor
>
(
"W"
);
auto
bias
=
ctx
.
Input
<
Tensor
>
(
"Bias"
);
PADDLE_ENFORCE
(
input
->
dims
().
size
()
==
2
||
input
->
dims
().
size
()
==
4
,
"Input must be with 2 or 4 dimensions, i.e. NCHW"
);
// TODO(intel friends): the native weight format is io,
// but the mkldnn weight format is oihw, which may need be transposed.
PADDLE_ENFORCE
(
w
->
dims
().
size
()
==
2
||
w
->
dims
().
size
()
==
4
,
"Weights must be with 2 or 4 dimensions, i.e. OI or OIHW"
);
bool
with_bias
=
ctx
.
Attr
<
bool
>
(
"bias_attr"
)
;
bool
with_bias
=
bias
!=
nullptr
;
MKLDNNMD
<
Tensor
>
md
(
input
,
w
,
with_bias
);
std
::
shared_ptr
<
mkldnn
::
inner_product_forward
::
primitive_desc
>
pd
=
...
...
@@ -154,6 +157,7 @@ class FCMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto
dst_memory
=
mem
.
dst
(
output_data
);
auto
src_memory
=
mem
.
src
(
input_data
);
auto
weights_memory
=
mem
.
weights
(
w_data
);
// TODO(intel friends): bias memory should also be obtain from bias->data()
auto
bias_memory
=
mem
.
bias
();
auto
forward
=
with_bias
?
mkldnn
::
inner_product_forward
(
...
...
@@ -216,7 +220,8 @@ class FCMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
const
Tensor
*
out_grad
=
ctx
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"Out"
));
const
T
*
out_grad_data
=
out_grad
->
data
<
T
>
();
bool
with_bias
=
ctx
.
Attr
<
bool
>
(
"bias_attr"
);
auto
bias
=
ctx
.
Input
<
Tensor
>
(
"Bias"
);
bool
with_bias
=
bias
!=
nullptr
;
MKLDNNMD
<
Tensor
>
md
(
input
,
w
,
with_bias
);
MKLDNNMemory
mem
(
&
md
,
mkldnn_engine
);
...
...
paddle/fluid/operators/fc_op.cc
浏览文件 @
e398348e
...
...
@@ -14,6 +14,9 @@ limitations under the License. */
#include "paddle/fluid/operators/fc_op.h"
#include <vector>
#include "paddle/fluid/operators/math/blas.h"
DECLARE_int32
(
paddle_num_threads
);
namespace
paddle
{
namespace
operators
{
...
...
@@ -25,16 +28,24 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const {
"Out(Output) of Fully Connected should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"W"
),
"W(Input) of Fully Connected should not be null."
);
// NCHW
auto
in_dims
=
ctx
->
GetInputDim
(
"Input"
);
// IO, I=C*H*W
auto
w_dims
=
ctx
->
GetInputDim
(
"W"
);
std
::
vector
<
int64_t
>
output_shape
({
in_dims
[
0
],
w_dims
[
1
]});
if
(
ctx
->
HasInput
(
"Bias"
))
{
auto
bias_dims
=
ctx
->
GetInputDim
(
"Bias"
);
PADDLE_ENFORCE_EQ
(
bias_dims
[
0
],
1
,
"The shape of Bias must be [1, dim]."
);
PADDLE_ENFORCE_EQ
(
bias_dims
[
1
],
w_dims
[
1
],
"The shape of Bias must be [1, dim]."
);
}
PADDLE_ENFORCE
(
in_dims
.
size
()
==
2
||
in_dims
.
size
()
==
4
,
"Fully Connected input should be 2-D or 4-D tensor."
);
PADDLE_ENFORCE
(
w_dims
.
size
()
==
2
||
w_dims
.
size
()
==
4
,
"Fully Connected input should be 2-D or 4-D tensor."
);
PADDLE_ENFORCE_EQ
(
w_dims
.
size
(),
2UL
,
"Fully Connected input should be 2-D tensor."
);
PADDLE_ENFORCE_EQ
(
framework
::
product
(
in_dims
)
/
in_dims
[
0
],
w_dims
[
0
],
"Fully Connected input and weigth size do not match."
);
ctx
->
SetOutputDim
(
"Out"
,
framework
::
make_ddim
(
output_shape
));
ctx
->
ShareLoD
(
"Input"
,
"Out"
);
...
...
@@ -42,9 +53,12 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const {
framework
::
OpKernelType
FCOp
::
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
{
framework
::
LibraryType
library
{
framework
::
LibraryType
::
kMKLDNN
};
framework
::
DataLayout
layout
{
framework
::
DataLayout
::
kMKLDNN
};
framework
::
LibraryType
library
=
framework
::
LibraryType
::
kPlain
;
framework
::
DataLayout
layout
=
framework
::
DataLayout
::
kAnyLayout
;
if
(
ctx
.
Attr
<
bool
>
(
"use_mkldnn"
))
{
library
=
framework
::
LibraryType
::
kMKLDNN
;
layout
=
framework
::
DataLayout
::
kMKLDNN
;
}
return
framework
::
OpKernelType
(
framework
::
ToDataType
(
ctx
.
Input
<
Tensor
>
(
"Input"
)
->
type
()),
ctx
.
GetPlace
(),
layout
,
library
);
...
...
@@ -60,27 +74,39 @@ void FCOpGrad::InferShape(framework::InferShapeContext* ctx) const {
if
(
ctx
->
HasOutput
(
framework
::
GradVarName
(
"W"
)))
{
ctx
->
SetOutputDim
(
framework
::
GradVarName
(
"W"
),
w_dims
);
}
if
(
ctx
->
HasInput
(
"Bias"
))
{
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
framework
::
GradVarName
(
"Bias"
)),
"Should have bias grad"
);
auto
bias_dims
=
ctx
->
GetInputDim
(
"Bias"
);
ctx
->
SetOutputDim
(
framework
::
GradVarName
(
"Bias"
),
bias_dims
);
}
}
framework
::
OpKernelType
FCOpGrad
::
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
{
framework
::
LibraryType
library
{
framework
::
LibraryType
::
kMKLDNN
};
framework
::
DataLayout
layout
{
framework
::
DataLayout
::
kMKLDNN
};
framework
::
LibraryType
library
=
framework
::
LibraryType
::
kPlain
;
framework
::
DataLayout
layout
=
framework
::
DataLayout
::
kAnyLayout
;
if
(
ctx
.
Attr
<
bool
>
(
"use_mkldnn"
))
{
library
=
framework
::
LibraryType
::
kMKLDNN
;
layout
=
framework
::
DataLayout
::
kMKLDNN
;
}
return
framework
::
OpKernelType
(
framework
::
ToDataType
(
ctx
.
Input
<
Tensor
>
(
"Input"
)
->
type
()),
ctx
.
GetPlace
(),
layout
,
library
);
}
void
FCOpMaker
::
Make
()
{
AddInput
(
"Input"
,
"(Tensor) The input tensor of fully connected operator. "
);
AddInput
(
"W"
,
"(Tensor), The second input tensor of fc op."
);
AddInput
(
"Input"
,
"(Tensor), The input tensor of fully connected operator with format "
"(NCHW). "
);
AddInput
(
"W"
,
"(Tensor), The weight fc op with shape (I, O)."
);
AddInput
(
"Bias"
,
"(Tensor, optional) Bias vector with shape (1 x O"
)
.
AsDispensable
();
AddOutput
(
"Out"
,
"(Tensor) The output tensor of fully connected operator. "
);
AddAttr
<
bool
>
(
"use_mkldnn"
,
"(bool, default false) Only used in mkldnn kernel"
)
.
SetDefault
(
false
);
AddAttr
<
bool
>
(
"bias_attr"
,
"(bool, default false) Only used in mkldnn kernel"
)
.
SetDefault
(
false
);
AddComment
(
R"DOC(
Fully Connected Operator.
...
...
@@ -94,9 +120,47 @@ void FCOpMaker::Make() {
)DOC"
);
}
template
<
typename
T
>
class
FCOpKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
paddle
::
framework
::
ExecutionContext
&
ctx
)
const
override
{
PADDLE_ENFORCE
(
platform
::
is_cpu_place
(
ctx
.
GetPlace
()),
"It must use CPUPlace."
);
auto
input
=
ctx
.
Input
<
Tensor
>
(
"Input"
);
auto
w
=
ctx
.
Input
<
Tensor
>
(
"W"
);
auto
bias
=
ctx
.
Input
<
Tensor
>
(
"Bias"
);
auto
output
=
ctx
.
Output
<
Tensor
>
(
"Out"
);
auto
in_dims
=
input
->
dims
();
auto
w_dims
=
w
->
dims
();
auto
&
dev_ctx
=
ctx
.
template
device_context
<
platform
::
CPUDeviceContext
>();
auto
blas
=
math
::
GetBlas
<
platform
::
CPUDeviceContext
,
T
>
(
dev_ctx
);
const
T
*
input_data
=
input
->
data
<
T
>
();
const
T
*
w_data
=
w
->
data
<
T
>
();
T
*
output_data
=
output
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
blas
.
GEMM
(
CblasNoTrans
,
CblasNoTrans
,
in_dims
[
0
],
w_dims
[
1
],
w_dims
[
0
],
static_cast
<
T
>
(
1
),
input_data
,
w_data
,
static_cast
<
T
>
(
0
),
output_data
);
if
(
bias
)
{
const
T
*
bias_data
=
bias
->
data
<
T
>
();
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for if (FLAGS_paddle_num_threads > 1)
#endif
for
(
int
bs
=
0
;
bs
<
in_dims
[
0
];
bs
++
)
{
blas
.
AXPY
(
w_dims
[
1
],
static_cast
<
T
>
(
1
),
bias_data
,
output_data
+
bs
*
w_dims
[
1
]);
}
}
}
};
}
// namespace operators
}
// namespace paddle
REGISTER_OPERATOR
(
fc
,
paddle
::
operators
::
FCOp
,
paddle
::
operators
::
FCOpMaker
,
namespace
ops
=
paddle
::
operators
;
REGISTER_OPERATOR
(
fc
,
ops
::
FCOp
,
ops
::
FCOpMaker
,
paddle
::
framework
::
DefaultGradOpDescMaker
<
true
>
);
REGISTER_OPERATOR
(
fc_grad
,
paddle
::
operators
::
FCOpGrad
);
REGISTER_OPERATOR
(
fc_grad
,
ops
::
FCOpGrad
);
REGISTER_OP_CPU_KERNEL
(
fc
,
ops
::
FCOpKernel
<
float
>
,
ops
::
FCOpKernel
<
double
>
);
paddle/fluid/operators/fill_constant_op.cc
浏览文件 @
e398348e
...
...
@@ -12,48 +12,28 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/operators/fill_constant_op.h"
#include "paddle/fluid/platform/float16.h"
namespace
paddle
{
namespace
operators
{
class
FillConstant
InferShape
:
public
framework
::
InferShapeBase
{
class
FillConstant
Op
:
public
framework
::
OperatorWithKernel
{
public:
void
operator
()(
framework
::
InferShapeContext
*
ctx
)
const
override
{
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Out"
),
"Output(Out) of FillConstantOp should not be null."
);
auto
&
shape
=
ctx
->
Attrs
().
Get
<
std
::
vector
<
int
>>
(
"shape"
);
auto
&
shape
=
ctx
->
Attrs
().
Get
<
std
::
vector
<
int
>>
(
"shape"
);
ctx
->
SetOutputDim
(
"Out"
,
framework
::
make_ddim
(
shape
));
}
};
class
FillConstantOp
:
public
framework
::
OperatorBase
{
public:
using
framework
::
OperatorBase
::
OperatorBase
;
private:
void
RunImpl
(
const
framework
::
Scope
&
scope
,
const
platform
::
Place
&
dev_place
)
const
override
{
auto
data_type
=
static_cast
<
framework
::
proto
::
VarType
::
Type
>
(
Attr
<
int
>
(
"dtype"
));
auto
value
=
Attr
<
float
>
(
"value"
);
auto
force_cpu
=
Attr
<
bool
>
(
"force_cpu"
);
auto
&
out
=
*
scope
.
FindVar
(
Output
(
"Out"
))
->
GetMutable
<
framework
::
LoDTensor
>
();
out
.
Resize
(
framework
::
make_ddim
(
Attr
<
std
::
vector
<
int
>>
(
"shape"
)));
if
(
force_cpu
)
{
auto
cpu
=
platform
::
CPUPlace
();
out
.
mutable_data
(
cpu
,
framework
::
ToTypeIndex
(
data_type
));
}
else
{
out
.
mutable_data
(
dev_place
,
framework
::
ToTypeIndex
(
data_type
));
}
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
&
dev_ctx
=
*
pool
.
Get
(
dev_place
);
math
::
set_constant
(
dev_ctx
,
&
out
,
value
);
framework
::
OpKernelType
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
return
framework
::
OpKernelType
(
static_cast
<
framework
::
proto
::
VarType
::
Type
>
(
ctx
.
Attr
<
int
>
(
"dtype"
)),
ctx
.
device_context
());
}
};
...
...
@@ -87,6 +67,11 @@ Fill up a variable with specified constant value.
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
REGISTER_OPERATOR
(
fill_constant
,
ops
::
FillConstantOp
,
ops
::
FillConstantInferShape
,
ops
::
FillConstantOpMaker
,
REGISTER_OPERATOR
(
fill_constant
,
ops
::
FillConstantOp
,
ops
::
FillConstantOpMaker
,
paddle
::
framework
::
EmptyGradOpMaker
);
REGISTER_OP_CPU_KERNEL
(
fill_constant
,
ops
::
FillConstantOpKernel
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
,
ops
::
FillConstantOpKernel
<
paddle
::
platform
::
CPUDeviceContext
,
double
>
,
ops
::
FillConstantOpKernel
<
paddle
::
platform
::
CPUDeviceContext
,
int
>
,
ops
::
FillConstantOpKernel
<
paddle
::
platform
::
CPUDeviceContext
,
int64_t
>
)
paddle/fluid/operators/fill_constant_op.cu.cc
0 → 100644
浏览文件 @
e398348e
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/fill_constant_op.h"
#include "paddle/fluid/platform/float16.h"
namespace
ops
=
paddle
::
operators
;
REGISTER_OP_CUDA_KERNEL
(
fill_constant
,
ops
::
FillConstantOpKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
FillConstantOpKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
ops
::
FillConstantOpKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int
>
,
ops
::
FillConstantOpKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
,
ops
::
FillConstantOpKernel
<
paddle
::
platform
::
CUDADeviceContext
,
paddle
::
platform
::
float16
>
)
paddle/fluid/operators/fill_constant_op.h
0 → 100644
浏览文件 @
e398348e
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace
paddle
{
namespace
operators
{
template
<
typename
DeviceContext
,
typename
T
>
class
FillConstantOpKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
data_type
=
static_cast
<
framework
::
proto
::
VarType
::
Type
>
(
ctx
.
Attr
<
int
>
(
"dtype"
));
auto
value
=
ctx
.
Attr
<
float
>
(
"value"
);
auto
force_cpu
=
ctx
.
Attr
<
bool
>
(
"force_cpu"
);
auto
*
out
=
ctx
.
Output
<
framework
::
Tensor
>
(
"Out"
);
out
->
Resize
(
framework
::
make_ddim
(
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"shape"
)));
if
(
force_cpu
)
{
auto
cpu
=
platform
::
CPUPlace
();
out
->
mutable_data
(
cpu
,
framework
::
ToTypeIndex
(
data_type
));
}
else
{
out
->
mutable_data
(
ctx
.
GetPlace
(),
framework
::
ToTypeIndex
(
data_type
));
}
math
::
set_constant
(
ctx
.
template
device_context
<
DeviceContext
>(),
out
,
value
);
}
};
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/fill_op.cc
浏览文件 @
e398348e
...
...
@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/float16.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -69,7 +70,6 @@ class FillOp : public framework::OperatorBase {
framework
::
VisitDataType
(
dtype
,
FillOpVisitor
(
&
tensor
,
Attr
<
std
::
vector
<
float
>>
(
"value"
)));
if
(
!
force_cpu
&&
platform
::
is_gpu_place
(
place
))
{
// Copy tensor to out
platform
::
DeviceContextPool
&
pool
=
...
...
paddle/fluid/operators/gaussian_random_op.cu
浏览文件 @
e398348e
...
...
@@ -15,6 +15,7 @@ limitations under the License. */
#include <thrust/transform.h>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/float16.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -60,6 +61,7 @@ class GPUGaussianRandomKernel : public framework::OpKernel<T> {
}
// namespace operators
}
// namespace paddle
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_CUDA_KERNEL
(
gaussian_random
,
paddle
::
operators
::
GPUGaussianRandomKernel
<
float
>
,
paddle
::
operators
::
GPUGaussianRandomKernel
<
double
>
);
...
...
paddle/fluid/operators/gru_op.cc
浏览文件 @
e398348e
...
...
@@ -14,6 +14,11 @@ limitations under the License. */
#include "paddle/fluid/operators/gru_op.h"
#include <string>
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/detail/gru_cpu_kernel.h"
#include "paddle/fluid/operators/math/detail/gru_kernel.h"
DECLARE_int32
(
paddle_num_threads
);
namespace
paddle
{
namespace
operators
{
...
...
@@ -211,6 +216,158 @@ class GRUGradOp : public framework::OperatorWithKernel {
}
};
template
<
typename
T
>
class
GRUCPUKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
BatchCompute
(
const
framework
::
ExecutionContext
&
context
)
const
{
using
DeviceContext
=
paddle
::
platform
::
CPUDeviceContext
;
auto
*
input
=
context
.
Input
<
LoDTensor
>
(
"Input"
);
auto
*
h0
=
context
.
Input
<
Tensor
>
(
"H0"
);
auto
*
weight
=
context
.
Input
<
Tensor
>
(
"Weight"
);
const
T
*
weight_data
=
weight
->
data
<
T
>
();
auto
*
bias
=
context
.
Input
<
Tensor
>
(
"Bias"
);
auto
*
batch_gate
=
context
.
Output
<
LoDTensor
>
(
"BatchGate"
);
batch_gate
->
mutable_data
<
T
>
(
context
.
GetPlace
());
auto
*
batch_reset_hidden_prev
=
context
.
Output
<
LoDTensor
>
(
"BatchResetHiddenPrev"
);
batch_reset_hidden_prev
->
mutable_data
<
T
>
(
context
.
GetPlace
());
auto
*
batch_hidden
=
context
.
Output
<
LoDTensor
>
(
"BatchHidden"
);
batch_hidden
->
mutable_data
<
T
>
(
context
.
GetPlace
());
auto
*
hidden
=
context
.
Output
<
LoDTensor
>
(
"Hidden"
);
hidden
->
mutable_data
<
T
>
(
context
.
GetPlace
());
auto
hidden_dims
=
hidden
->
dims
();
bool
is_reverse
=
context
.
Attr
<
bool
>
(
"is_reverse"
);
math
::
LoDTensor2BatchFunctor
<
DeviceContext
,
T
>
to_batch
;
auto
&
dev_ctx
=
context
.
template
device_context
<
DeviceContext
>();
to_batch
(
dev_ctx
,
*
input
,
batch_gate
,
true
,
is_reverse
);
if
(
bias
)
{
math
::
RowwiseAdd
<
DeviceContext
,
T
>
add_bias
;
add_bias
(
dev_ctx
,
*
batch_gate
,
*
bias
,
batch_gate
);
}
int
frame_size
=
hidden_dims
[
1
];
math
::
GRUMetaValue
<
T
>
gru_value
;
gru_value
.
gate_weight
=
const_cast
<
T
*>
(
weight_data
);
gru_value
.
state_weight
=
const_cast
<
T
*>
(
weight_data
+
2
*
frame_size
*
frame_size
);
Tensor
ordered_h0
;
framework
::
Vector
<
size_t
>
order
(
batch_gate
->
lod
()[
2
]);
if
(
h0
)
{
// Since the batch computing for GRU reorders the input sequences
// according to their length. The initialized cell state also needs
// to reorder.
ReorderInitState
<
DeviceContext
,
T
>
(
context
.
template
device_context
<
DeviceContext
>(),
*
h0
,
order
,
&
ordered_h0
,
true
);
gru_value
.
prev_out_value
=
ordered_h0
.
data
<
T
>
();
}
else
{
gru_value
.
prev_out_value
=
nullptr
;
}
auto
batch_starts
=
batch_gate
->
lod
()[
0
];
size_t
seq_len
=
batch_starts
.
size
()
-
1
;
auto
active_node
=
math
::
detail
::
GetActivationType
(
context
.
Attr
<
std
::
string
>
(
"activation"
));
auto
active_gate
=
math
::
detail
::
GetActivationType
(
context
.
Attr
<
std
::
string
>
(
"gate_activation"
));
#ifdef PADDLE_WITH_MKLML
// use MKL packed to speedup GEMM
if
(
FLAGS_paddle_num_threads
>=
4
)
{
auto
blas
=
math
::
GetBlas
<
DeviceContext
,
T
>
(
dev_ctx
);
T
*
packed_gate
=
blas
.
GEMM_ALLOC
(
CblasBMatrix
,
1
/*height of C*/
,
frame_size
*
2
/*width of weight*/
,
frame_size
/*height of height*/
);
PADDLE_ENFORCE
(
packed_gate
);
blas
.
GEMM_PACK
(
CblasBMatrix
,
CblasNoTrans
,
1
/*cur bs?*/
,
frame_size
*
2
,
frame_size
,
T
(
1.0
),
gru_value
.
gate_weight
,
frame_size
*
2
,
packed_gate
);
T
*
packed_state
=
blas
.
GEMM_ALLOC
(
CblasBMatrix
,
1
/*height of C*/
,
frame_size
/*width of weight*/
,
frame_size
/*height of height*/
);
PADDLE_ENFORCE
(
packed_state
);
blas
.
GEMM_PACK
(
CblasBMatrix
,
CblasNoTrans
,
1
/*cur bs?*/
,
frame_size
,
frame_size
,
T
(
1.0
),
gru_value
.
state_weight
,
frame_size
,
packed_state
);
for
(
size_t
n
=
0
;
n
<
seq_len
;
n
++
)
{
int
bstart
=
static_cast
<
int
>
(
batch_starts
[
n
]);
int
bend
=
static_cast
<
int
>
(
batch_starts
[
n
+
1
]);
int
cur_batch_size
=
bend
-
bstart
;
Tensor
gate_t
=
batch_gate
->
Slice
(
bstart
,
bend
);
Tensor
reset_hidden_prev_t
=
batch_reset_hidden_prev
->
Slice
(
bstart
,
bend
);
Tensor
hidden_t
=
batch_hidden
->
Slice
(
bstart
,
bend
);
gru_value
.
output_value
=
hidden_t
.
data
<
T
>
();
gru_value
.
gate_value
=
gate_t
.
data
<
T
>
();
gru_value
.
reset_output_value
=
reset_hidden_prev_t
.
data
<
T
>
();
if
(
gru_value
.
prev_out_value
)
{
blas
.
GEMM_COMPUTE
(
CblasNoTrans
,
CblasPacked
,
cur_batch_size
,
frame_size
*
2
,
frame_size
,
gru_value
.
prev_out_value
,
frame_size
,
packed_gate
,
frame_size
*
2
,
T
(
1
),
gru_value
.
gate_value
,
frame_size
*
3
);
}
math
::
detail
::
forward_reset_output
(
math
::
detail
::
forward
::
gru_resetOutput
<
T
>
(),
gru_value
,
frame_size
,
cur_batch_size
,
active_gate
);
if
(
gru_value
.
prev_out_value
)
{
blas
.
GEMM_COMPUTE
(
CblasNoTrans
,
CblasPacked
,
cur_batch_size
,
frame_size
,
frame_size
,
gru_value
.
reset_output_value
,
frame_size
,
packed_state
,
frame_size
,
T
(
1
),
gru_value
.
gate_value
+
frame_size
*
2
,
frame_size
*
3
);
}
math
::
detail
::
forward_final_output
(
math
::
detail
::
forward
::
gru_finalOutput
<
T
>
(),
gru_value
,
frame_size
,
cur_batch_size
,
active_node
);
gru_value
.
prev_out_value
=
gru_value
.
output_value
;
}
blas
.
GEMM_FREE
(
packed_gate
);
blas
.
GEMM_FREE
(
packed_state
);
}
else
{
#endif
for
(
size_t
n
=
0
;
n
<
seq_len
;
n
++
)
{
int
bstart
=
static_cast
<
int
>
(
batch_starts
[
n
]);
int
bend
=
static_cast
<
int
>
(
batch_starts
[
n
+
1
]);
int
cur_batch_size
=
bend
-
bstart
;
Tensor
gate_t
=
batch_gate
->
Slice
(
bstart
,
bend
);
Tensor
reset_hidden_prev_t
=
batch_reset_hidden_prev
->
Slice
(
bstart
,
bend
);
Tensor
hidden_t
=
batch_hidden
->
Slice
(
bstart
,
bend
);
gru_value
.
output_value
=
hidden_t
.
data
<
T
>
();
gru_value
.
gate_value
=
gate_t
.
data
<
T
>
();
gru_value
.
reset_output_value
=
reset_hidden_prev_t
.
data
<
T
>
();
math
::
GRUUnitFunctor
<
DeviceContext
,
T
>::
compute
(
dev_ctx
,
gru_value
,
frame_size
,
cur_batch_size
,
active_node
,
active_gate
);
gru_value
.
prev_out_value
=
gru_value
.
output_value
;
}
#ifdef PADDLE_WITH_MKLML
}
#endif
math
::
Batch2LoDTensorFunctor
<
DeviceContext
,
T
>
to_seq
;
batch_hidden
->
set_lod
(
batch_gate
->
lod
());
to_seq
(
dev_ctx
,
*
batch_hidden
,
hidden
);
}
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
BatchCompute
(
context
);
}
};
}
// namespace operators
}
// namespace paddle
...
...
@@ -218,9 +375,8 @@ namespace ops = paddle::operators;
REGISTER_OPERATOR
(
gru
,
ops
::
GRUOp
,
ops
::
GRUOpMaker
,
paddle
::
framework
::
DefaultGradOpDescMaker
<
true
>
);
REGISTER_OPERATOR
(
gru_grad
,
ops
::
GRUGradOp
);
REGISTER_OP_CPU_KERNEL
(
gru
,
ops
::
GRUKernel
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
,
ops
::
GRUKernel
<
paddle
::
platform
::
CPUDeviceContext
,
double
>
);
REGISTER_OP_CPU_KERNEL
(
gru
,
ops
::
GRUCPUKernel
<
float
>
,
ops
::
GRUCPUKernel
<
double
>
);
REGISTER_OP_CPU_KERNEL
(
gru_grad
,
ops
::
GRUGradKernel
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
,
ops
::
GRUGradKernel
<
paddle
::
platform
::
CPUDeviceContext
,
double
>
);
paddle/fluid/operators/gru_op.cu.cc
浏览文件 @
e398348e
...
...
@@ -14,6 +14,96 @@ limitations under the License. */
#include "paddle/fluid/operators/gru_op.h"
namespace
paddle
{
namespace
operators
{
template
<
typename
DeviceContext
,
typename
T
>
class
GRUKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
BatchCompute
(
const
framework
::
ExecutionContext
&
context
)
const
{
auto
*
input
=
context
.
Input
<
LoDTensor
>
(
"Input"
);
auto
*
h0
=
context
.
Input
<
Tensor
>
(
"H0"
);
auto
*
weight
=
context
.
Input
<
Tensor
>
(
"Weight"
);
const
T
*
weight_data
=
weight
->
data
<
T
>
();
auto
*
bias
=
context
.
Input
<
Tensor
>
(
"Bias"
);
auto
*
batch_gate
=
context
.
Output
<
LoDTensor
>
(
"BatchGate"
);
batch_gate
->
mutable_data
<
T
>
(
context
.
GetPlace
());
auto
*
batch_reset_hidden_prev
=
context
.
Output
<
LoDTensor
>
(
"BatchResetHiddenPrev"
);
batch_reset_hidden_prev
->
mutable_data
<
T
>
(
context
.
GetPlace
());
auto
*
batch_hidden
=
context
.
Output
<
LoDTensor
>
(
"BatchHidden"
);
batch_hidden
->
mutable_data
<
T
>
(
context
.
GetPlace
());
auto
*
hidden
=
context
.
Output
<
LoDTensor
>
(
"Hidden"
);
hidden
->
mutable_data
<
T
>
(
context
.
GetPlace
());
auto
hidden_dims
=
hidden
->
dims
();
bool
is_reverse
=
context
.
Attr
<
bool
>
(
"is_reverse"
);
math
::
LoDTensor2BatchFunctor
<
DeviceContext
,
T
>
to_batch
;
auto
&
dev_ctx
=
context
.
template
device_context
<
DeviceContext
>();
to_batch
(
dev_ctx
,
*
input
,
batch_gate
,
true
,
is_reverse
);
if
(
bias
)
{
math
::
RowwiseAdd
<
DeviceContext
,
T
>
add_bias
;
add_bias
(
dev_ctx
,
*
batch_gate
,
*
bias
,
batch_gate
);
}
int
frame_size
=
hidden_dims
[
1
];
math
::
GRUMetaValue
<
T
>
gru_value
;
gru_value
.
gate_weight
=
const_cast
<
T
*>
(
weight_data
);
gru_value
.
state_weight
=
const_cast
<
T
*>
(
weight_data
+
2
*
frame_size
*
frame_size
);
Tensor
ordered_h0
;
framework
::
Vector
<
size_t
>
order
(
batch_gate
->
lod
()[
2
]);
if
(
h0
)
{
// Since the batch computing for GRU reorders the input sequences
// according to their length. The initialized cell state also needs
// to reorder.
ReorderInitState
<
DeviceContext
,
T
>
(
context
.
template
device_context
<
DeviceContext
>(),
*
h0
,
order
,
&
ordered_h0
,
true
);
gru_value
.
prev_out_value
=
ordered_h0
.
data
<
T
>
();
}
else
{
gru_value
.
prev_out_value
=
nullptr
;
}
auto
batch_starts
=
batch_gate
->
lod
()[
0
];
size_t
num_batch
=
batch_starts
.
size
()
-
1
;
auto
active_node
=
math
::
detail
::
GetActivationType
(
context
.
Attr
<
std
::
string
>
(
"activation"
));
auto
active_gate
=
math
::
detail
::
GetActivationType
(
context
.
Attr
<
std
::
string
>
(
"gate_activation"
));
for
(
size_t
n
=
0
;
n
<
num_batch
;
n
++
)
{
int
bstart
=
static_cast
<
int
>
(
batch_starts
[
n
]);
int
bend
=
static_cast
<
int
>
(
batch_starts
[
n
+
1
]);
int
cur_batch_size
=
bend
-
bstart
;
Tensor
gate_t
=
batch_gate
->
Slice
(
bstart
,
bend
);
Tensor
reset_hidden_prev_t
=
batch_reset_hidden_prev
->
Slice
(
bstart
,
bend
);
Tensor
hidden_t
=
batch_hidden
->
Slice
(
bstart
,
bend
);
gru_value
.
output_value
=
hidden_t
.
data
<
T
>
();
gru_value
.
gate_value
=
gate_t
.
data
<
T
>
();
gru_value
.
reset_output_value
=
reset_hidden_prev_t
.
data
<
T
>
();
math
::
GRUUnitFunctor
<
DeviceContext
,
T
>::
compute
(
dev_ctx
,
gru_value
,
frame_size
,
cur_batch_size
,
active_node
,
active_gate
);
gru_value
.
prev_out_value
=
gru_value
.
output_value
;
}
math
::
Batch2LoDTensorFunctor
<
DeviceContext
,
T
>
to_seq
;
batch_hidden
->
set_lod
(
batch_gate
->
lod
());
to_seq
(
dev_ctx
,
*
batch_hidden
,
hidden
);
}
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
BatchCompute
(
context
);
}
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
REGISTER_OP_CUDA_KERNEL
(
gru
,
ops
::
GRUKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
...
...
paddle/fluid/operators/gru_op.h
浏览文件 @
e398348e
...
...
@@ -37,90 +37,6 @@ inline void ReorderInitState(const DeviceContext& ctx,
row_shuffle
(
ctx
,
src
,
index_lod
,
dst
,
indexed_src
);
}
template
<
typename
DeviceContext
,
typename
T
>
class
GRUKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
BatchCompute
(
const
framework
::
ExecutionContext
&
context
)
const
{
auto
*
input
=
context
.
Input
<
LoDTensor
>
(
"Input"
);
auto
*
h0
=
context
.
Input
<
Tensor
>
(
"H0"
);
auto
*
weight
=
context
.
Input
<
Tensor
>
(
"Weight"
);
const
T
*
weight_data
=
weight
->
data
<
T
>
();
auto
*
bias
=
context
.
Input
<
Tensor
>
(
"Bias"
);
auto
*
batch_gate
=
context
.
Output
<
LoDTensor
>
(
"BatchGate"
);
batch_gate
->
mutable_data
<
T
>
(
context
.
GetPlace
());
auto
*
batch_reset_hidden_prev
=
context
.
Output
<
LoDTensor
>
(
"BatchResetHiddenPrev"
);
batch_reset_hidden_prev
->
mutable_data
<
T
>
(
context
.
GetPlace
());
auto
*
batch_hidden
=
context
.
Output
<
LoDTensor
>
(
"BatchHidden"
);
batch_hidden
->
mutable_data
<
T
>
(
context
.
GetPlace
());
auto
*
hidden
=
context
.
Output
<
LoDTensor
>
(
"Hidden"
);
hidden
->
mutable_data
<
T
>
(
context
.
GetPlace
());
auto
hidden_dims
=
hidden
->
dims
();
bool
is_reverse
=
context
.
Attr
<
bool
>
(
"is_reverse"
);
math
::
LoDTensor2BatchFunctor
<
DeviceContext
,
T
>
to_batch
;
auto
&
dev_ctx
=
context
.
template
device_context
<
DeviceContext
>();
to_batch
(
dev_ctx
,
*
input
,
batch_gate
,
true
,
is_reverse
);
if
(
bias
)
{
math
::
RowwiseAdd
<
DeviceContext
,
T
>
add_bias
;
add_bias
(
dev_ctx
,
*
batch_gate
,
*
bias
,
batch_gate
);
}
int
frame_size
=
hidden_dims
[
1
];
math
::
GRUMetaValue
<
T
>
gru_value
;
gru_value
.
gate_weight
=
const_cast
<
T
*>
(
weight_data
);
gru_value
.
state_weight
=
const_cast
<
T
*>
(
weight_data
+
2
*
frame_size
*
frame_size
);
Tensor
ordered_h0
;
framework
::
Vector
<
size_t
>
order
(
batch_gate
->
lod
()[
2
]);
if
(
h0
)
{
// Since the batch computing for GRU reorders the input sequences
// according to their length. The initialized cell state also needs
// to reorder.
ReorderInitState
<
DeviceContext
,
T
>
(
context
.
template
device_context
<
DeviceContext
>(),
*
h0
,
order
,
&
ordered_h0
,
true
);
gru_value
.
prev_out_value
=
ordered_h0
.
data
<
T
>
();
}
else
{
gru_value
.
prev_out_value
=
nullptr
;
}
auto
batch_starts
=
batch_gate
->
lod
()[
0
];
size_t
num_batch
=
batch_starts
.
size
()
-
1
;
auto
active_node
=
math
::
detail
::
GetActivationType
(
context
.
Attr
<
std
::
string
>
(
"activation"
));
auto
active_gate
=
math
::
detail
::
GetActivationType
(
context
.
Attr
<
std
::
string
>
(
"gate_activation"
));
for
(
size_t
n
=
0
;
n
<
num_batch
;
n
++
)
{
int
bstart
=
static_cast
<
int
>
(
batch_starts
[
n
]);
int
bend
=
static_cast
<
int
>
(
batch_starts
[
n
+
1
]);
int
cur_batch_size
=
bend
-
bstart
;
Tensor
gate_t
=
batch_gate
->
Slice
(
bstart
,
bend
);
Tensor
reset_hidden_prev_t
=
batch_reset_hidden_prev
->
Slice
(
bstart
,
bend
);
Tensor
hidden_t
=
batch_hidden
->
Slice
(
bstart
,
bend
);
gru_value
.
output_value
=
hidden_t
.
data
<
T
>
();
gru_value
.
gate_value
=
gate_t
.
data
<
T
>
();
gru_value
.
reset_output_value
=
reset_hidden_prev_t
.
data
<
T
>
();
math
::
GRUUnitFunctor
<
DeviceContext
,
T
>::
compute
(
dev_ctx
,
gru_value
,
frame_size
,
cur_batch_size
,
active_node
,
active_gate
);
gru_value
.
prev_out_value
=
gru_value
.
output_value
;
}
math
::
Batch2LoDTensorFunctor
<
DeviceContext
,
T
>
to_seq
;
batch_hidden
->
set_lod
(
batch_gate
->
lod
());
to_seq
(
dev_ctx
,
*
batch_hidden
,
hidden
);
}
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
BatchCompute
(
context
);
}
};
template
<
typename
DeviceContext
,
typename
T
>
class
GRUGradKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
...
...
paddle/fluid/operators/listen_and_serv_op.cc
浏览文件 @
e398348e
...
...
@@ -123,8 +123,11 @@ void ListenAndServOp::RunSyncLoop(
optimize_prepared
.
begin
(),
std
::
shared_ptr
<
framework
::
ExecutorPrepareContext
>
(
nullptr
));
// Trainers will get all parameters from pserver in the
// startup program, so we will wait RequestGet first
rpc_service_
->
SetCond
(
distributed
::
kRequestGet
);
rpc_service_
->
WaitBarrier
(
distributed
::
kRequestGet
);
rpc_service_
->
ResetBarrierCounter
();
while
(
true
)
{
rpc_service_
->
Profiler
().
OneStep
();
// Get from multiple trainers, we don't care about the order in which
...
...
paddle/fluid/operators/math/blas.h
浏览文件 @
e398348e
...
...
@@ -90,6 +90,25 @@ class Blas {
void
GEMM
(
bool
transA
,
bool
transB
,
int
M
,
int
N
,
int
K
,
T
alpha
,
const
T
*
A
,
int
lda
,
const
T
*
B
,
int
ldb
,
T
beta
,
T
*
C
,
int
ldc
)
const
;
#ifdef PADDLE_WITH_MKLML
template
<
typename
T
>
T
*
GEMM_ALLOC
(
const
CBLAS_IDENTIFIER
id
,
const
int
M
,
const
int
N
,
const
int
K
)
const
;
template
<
typename
T
>
void
GEMM_PACK
(
const
CBLAS_IDENTIFIER
id
,
const
CBLAS_TRANSPOSE
trans
,
int
M
,
int
N
,
int
K
,
const
T
alpha
,
const
T
*
src
,
const
int
ld
,
T
*
dst
)
const
;
template
<
typename
T
>
void
GEMM_COMPUTE
(
int
transA
,
int
transB
,
int
M
,
int
N
,
int
K
,
const
T
*
A
,
const
int
lda
,
const
T
*
B
,
const
int
ldb
,
T
beta
,
T
*
C
,
const
int
ldc
)
const
;
template
<
typename
T
>
void
GEMM_FREE
(
T
*
data
)
const
;
#endif
template
<
typename
T
>
void
MatMul
(
const
framework
::
Tensor
&
mat_a
,
bool
trans_a
,
const
framework
::
Tensor
&
mat_b
,
bool
trans_b
,
T
alpha
,
...
...
@@ -146,6 +165,28 @@ class BlasT : private Blas<DeviceContext> {
Base
()
->
template
GEMM
<
T
>(
args
...);
}
#ifdef PADDLE_WITH_MKLML
template
<
typename
...
ARGS
>
T
*
GEMM_ALLOC
(
ARGS
...
args
)
const
{
return
Base
()
->
template
GEMM_ALLOC
<
T
>(
args
...);
}
template
<
typename
...
ARGS
>
void
GEMM_PACK
(
ARGS
...
args
)
const
{
Base
()
->
template
GEMM_PACK
<
T
>(
args
...);
}
template
<
typename
...
ARGS
>
void
GEMM_COMPUTE
(
ARGS
...
args
)
const
{
Base
()
->
template
GEMM_COMPUTE
<
T
>(
args
...);
}
template
<
typename
...
ARGS
>
void
GEMM_FREE
(
ARGS
...
args
)
const
{
Base
()
->
template
GEMM_FREE
<
T
>(
args
...);
}
#endif
template
<
typename
...
ARGS
>
void
MatMul
(
ARGS
...
args
)
const
{
Base
()
->
template
MatMul
<
T
>(
args
...);
...
...
paddle/fluid/operators/math/blas_impl.h
浏览文件 @
e398348e
...
...
@@ -31,6 +31,26 @@ struct CBlas<float> {
platform
::
dynload
::
cblas_sgemm
(
args
...);
}
template
<
typename
...
ARGS
>
static
float
*
GEMM_ALLOC
(
ARGS
...
args
)
{
return
platform
::
dynload
::
cblas_sgemm_alloc
(
args
...);
}
template
<
typename
...
ARGS
>
static
void
GEMM_PACK
(
ARGS
...
args
)
{
platform
::
dynload
::
cblas_sgemm_pack
(
args
...);
}
template
<
typename
...
ARGS
>
static
void
GEMM_COMPUTE
(
ARGS
...
args
)
{
platform
::
dynload
::
cblas_sgemm_compute
(
args
...);
}
template
<
typename
...
ARGS
>
static
void
GEMM_FREE
(
ARGS
...
args
)
{
platform
::
dynload
::
cblas_sgemm_free
(
args
...);
}
#ifdef PADDLE_WITH_LIBXSMM
template
<
typename
...
ARGS
>
static
void
SMM_GEMM
(
ARGS
...
args
)
{
...
...
@@ -71,6 +91,26 @@ struct CBlas<double> {
platform
::
dynload
::
cblas_dgemm
(
args
...);
}
template
<
typename
...
ARGS
>
static
double
*
GEMM_ALLOC
(
ARGS
...
args
)
{
return
platform
::
dynload
::
cblas_dgemm_alloc
(
args
...);
}
template
<
typename
...
ARGS
>
static
void
GEMM_PACK
(
ARGS
...
args
)
{
platform
::
dynload
::
cblas_dgemm_pack
(
args
...);
}
template
<
typename
...
ARGS
>
static
void
GEMM_COMPUTE
(
ARGS
...
args
)
{
platform
::
dynload
::
cblas_dgemm_compute
(
args
...);
}
template
<
typename
...
ARGS
>
static
void
GEMM_FREE
(
ARGS
...
args
)
{
platform
::
dynload
::
cblas_dgemm_free
(
args
...);
}
#ifdef PADDLE_WITH_LIBXSMM
template
<
typename
...
ARGS
>
static
void
SMM_GEMM
(
ARGS
...
args
)
{
...
...
@@ -224,6 +264,41 @@ inline void GEMM_WARP(CBLAS_ORDER order, CBLAS_TRANSPOSE transA,
beta
,
C
,
ldc
);
}
#ifdef PADDLE_WITH_MKLML
template
<
>
template
<
typename
T
>
T
*
Blas
<
platform
::
CPUDeviceContext
>::
GEMM_ALLOC
(
const
CBLAS_IDENTIFIER
id
,
const
int
M
,
const
int
N
,
const
int
K
)
const
{
return
CBlas
<
T
>::
GEMM_ALLOC
(
id
,
M
,
N
,
K
);
}
template
<
>
template
<
typename
T
>
void
Blas
<
platform
::
CPUDeviceContext
>::
GEMM_PACK
(
const
CBLAS_IDENTIFIER
id
,
const
CBLAS_TRANSPOSE
trans
,
int
M
,
int
N
,
int
K
,
const
T
alpha
,
const
T
*
src
,
const
int
ld
,
T
*
dst
)
const
{
CBlas
<
T
>::
GEMM_PACK
(
CblasRowMajor
,
id
,
trans
,
M
,
N
,
K
,
alpha
,
src
,
ld
,
dst
);
}
template
<
>
template
<
typename
T
>
void
Blas
<
platform
::
CPUDeviceContext
>::
GEMM_COMPUTE
(
int
transA
,
int
transB
,
int
M
,
int
N
,
int
K
,
const
T
*
A
,
const
int
lda
,
const
T
*
B
,
const
int
ldb
,
T
beta
,
T
*
C
,
const
int
ldc
)
const
{
CBlas
<
T
>::
GEMM_COMPUTE
(
CblasRowMajor
,
transA
,
transB
,
M
,
N
,
K
,
A
,
lda
,
B
,
ldb
,
beta
,
C
,
ldc
);
}
template
<
>
template
<
typename
T
>
void
Blas
<
platform
::
CPUDeviceContext
>::
GEMM_FREE
(
T
*
data
)
const
{
CBlas
<
T
>::
GEMM_FREE
(
data
);
}
#endif
template
<
>
template
<
typename
T
>
void
Blas
<
platform
::
CPUDeviceContext
>::
GEMM
(
CBLAS_TRANSPOSE
transA
,
...
...
paddle/fluid/operators/math/cross_entropy.cu
浏览文件 @
e398348e
...
...
@@ -15,11 +15,25 @@ limitations under the License. */
#include "paddle/fluid/operators/math/cross_entropy.h"
#include "paddle/fluid/platform/cuda_device_function.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/float16.h"
namespace
paddle
{
namespace
operators
{
namespace
math
{
template
<
typename
T
>
HOSTDEVICE
T
log
(
const
T
&
val
)
{
return
std
::
log
(
val
);
}
template
<
>
HOSTDEVICE
platform
::
float16
log
(
const
platform
::
float16
&
val
)
{
// strage bug, hlog is not exists.
return
static_cast
<
float16
>
(
0
);
// half tmp = static_cast<half>(val);
// return static_cast<platform::float16>(hlog(tmp));
}
namespace
{
template
<
typename
T
>
__global__
void
CrossEntropyKernel
(
T
*
Y
,
const
T
*
X
,
const
int64_t
*
label
,
...
...
@@ -35,12 +49,12 @@ template <typename T>
__global__
void
SoftCrossEntropyKernel
(
T
*
Y
,
const
T
*
X
,
const
T
*
label
,
const
int
class_num
)
{
int
tid
=
threadIdx
.
x
;
T
val
=
0
;
T
val
(
0
)
;
int
idx
=
blockIdx
.
x
*
class_num
+
tid
;
int
end
=
blockIdx
.
x
*
class_num
+
class_num
;
for
(;
idx
<
end
;
idx
+=
blockDim
.
x
)
{
val
+=
math
::
TolerableValue
<
T
>
()(
std
::
log
(
X
[
idx
]))
*
label
[
idx
];
val
+=
math
::
TolerableValue
<
T
>
()(
log
(
X
[
idx
]))
*
label
[
idx
];
}
val
=
paddle
::
platform
::
reduceSum
(
val
,
tid
,
blockDim
.
x
);
...
...
@@ -84,6 +98,8 @@ class CrossEntropyFunctor<platform::CUDADeviceContext, T> {
template
class
CrossEntropyFunctor
<
platform
::
CUDADeviceContext
,
float
>;
template
class
CrossEntropyFunctor
<
platform
::
CUDADeviceContext
,
double
>;
template
class
CrossEntropyFunctor
<
platform
::
CUDADeviceContext
,
platform
::
float16
>;
}
// namespace math
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/math/cross_entropy.h
浏览文件 @
e398348e
...
...
@@ -13,8 +13,10 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <limits>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/hostdevice.h"
namespace
paddle
{
...
...
@@ -33,6 +35,21 @@ struct TolerableValue {
}
};
// float16 value clip behave different.
using
paddle
::
platform
::
float16
;
using
paddle
::
platform
::
isfinite
;
template
<
>
struct
TolerableValue
<
float16
>
{
HOSTDEVICE
float16
operator
()(
const
float16
&
x
)
const
{
if
(
isfinite
(
x
))
return
x
;
else
if
(
x
>
static_cast
<
float16
>
(
0
))
return
std
::
numeric_limits
<
float16
>::
max
();
else
return
std
::
numeric_limits
<
float16
>::
min
();
}
};
template
<
typename
DeviceContext
,
typename
T
>
class
CrossEntropyFunctor
{
public:
...
...
paddle/fluid/operators/math/selected_rows_functor.cu
浏览文件 @
e398348e
...
...
@@ -18,6 +18,7 @@ limitations under the License. */
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/float16.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -76,6 +77,7 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> {
template
struct
SelectedRowsAdd
<
platform
::
CUDADeviceContext
,
float
>;
template
struct
SelectedRowsAdd
<
platform
::
CUDADeviceContext
,
double
>;
template
struct
SelectedRowsAdd
<
platform
::
CUDADeviceContext
,
platform
::
float16
>;
namespace
{
template
<
typename
T
,
int
block_size
>
...
...
@@ -120,7 +122,7 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
auto
*
out_data
=
output
->
data
<
T
>
();
SetConstant
<
platform
::
CUDADeviceContext
,
T
>
functor
;
functor
(
context
,
output
,
0.0
);
functor
(
context
,
output
,
static_cast
<
T
>
(
0
)
);
const
int
block_size
=
256
;
dim3
threads
(
block_size
,
1
);
...
...
@@ -138,6 +140,8 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
template
struct
SelectedRowsAddTensor
<
platform
::
CUDADeviceContext
,
float
>;
template
struct
SelectedRowsAddTensor
<
platform
::
CUDADeviceContext
,
double
>;
template
struct
SelectedRowsAddTensor
<
platform
::
CUDADeviceContext
,
platform
::
float16
>;
template
<
typename
T
>
struct
SelectedRowsAddTo
<
platform
::
CUDADeviceContext
,
T
>
{
...
...
@@ -177,6 +181,8 @@ template struct SelectedRowsAddTo<platform::CUDADeviceContext, float>;
template
struct
SelectedRowsAddTo
<
platform
::
CUDADeviceContext
,
double
>;
template
struct
SelectedRowsAddTo
<
platform
::
CUDADeviceContext
,
int
>;
template
struct
SelectedRowsAddTo
<
platform
::
CUDADeviceContext
,
int64_t
>;
template
struct
SelectedRowsAddTo
<
platform
::
CUDADeviceContext
,
platform
::
float16
>;
namespace
{
template
<
typename
T
,
int
block_size
>
...
...
@@ -229,6 +235,8 @@ template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, float>;
template
struct
SelectedRowsAddToTensor
<
platform
::
CUDADeviceContext
,
double
>;
template
struct
SelectedRowsAddToTensor
<
platform
::
CUDADeviceContext
,
int
>;
template
struct
SelectedRowsAddToTensor
<
platform
::
CUDADeviceContext
,
int64_t
>;
template
struct
SelectedRowsAddToTensor
<
platform
::
CUDADeviceContext
,
platform
::
float16
>;
namespace
scatter
{
...
...
@@ -276,7 +284,7 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
context
.
GetPlace
());
math
::
SetConstant
<
platform
::
CUDADeviceContext
,
T
>
constant_functor
;
constant_functor
(
context
,
out
.
mutable_value
(),
0.0
);
constant_functor
(
context
,
out
.
mutable_value
(),
static_cast
<
T
>
(
0
)
);
auto
*
out_data
=
out
.
mutable_value
()
->
data
<
T
>
();
auto
*
input_data
=
input
.
value
().
data
<
T
>
();
...
...
@@ -300,6 +308,7 @@ template struct MergeAdd<platform::CUDADeviceContext, float>;
template
struct
MergeAdd
<
platform
::
CUDADeviceContext
,
double
>;
template
struct
MergeAdd
<
platform
::
CUDADeviceContext
,
int
>;
template
struct
MergeAdd
<
platform
::
CUDADeviceContext
,
int64_t
>;
template
struct
MergeAdd
<
platform
::
CUDADeviceContext
,
platform
::
float16
>;
template
<
typename
T
,
int
block_size
>
__global__
void
UpdateToTensorKernel
(
const
T
*
selected_rows
,
...
...
paddle/fluid/operators/math/softmax.cu
浏览文件 @
e398348e
...
...
@@ -94,12 +94,15 @@ void SoftmaxGradCUDNNFunctor<T>::operator()(
template
class
SoftmaxCUDNNFunctor
<
platform
::
float16
>;
template
class
SoftmaxCUDNNFunctor
<
float
>;
template
class
SoftmaxCUDNNFunctor
<
double
>;
template
class
SoftmaxGradCUDNNFunctor
<
platform
::
float16
>;
template
class
SoftmaxGradCUDNNFunctor
<
float
>;
template
class
SoftmaxGradCUDNNFunctor
<
double
>;
template
class
SoftmaxFunctor
<
platform
::
CUDADeviceContext
,
platform
::
float16
>;
template
class
SoftmaxFunctor
<
platform
::
CUDADeviceContext
,
float
>;
template
class
SoftmaxFunctor
<
platform
::
CUDADeviceContext
,
double
>;
template
class
SoftmaxGradFunctor
<
platform
::
CUDADeviceContext
,
platform
::
float16
>;
template
class
SoftmaxGradFunctor
<
platform
::
CUDADeviceContext
,
float
>;
template
class
SoftmaxGradFunctor
<
platform
::
CUDADeviceContext
,
double
>;
...
...
paddle/fluid/operators/mean_op.cu
浏览文件 @
e398348e
...
...
@@ -12,14 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/fluid/operators/mean_op.h"
#include "paddle/fluid/platform/float16.h"
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_CUDA_KERNEL
(
mean
,
ops
::
MeanKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
MeanKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
);
ops
::
MeanKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
ops
::
MeanKernel
<
paddle
::
platform
::
CUDADeviceContext
,
plat
::
float16
>
);
REGISTER_OP_CUDA_KERNEL
(
mean_grad
,
ops
::
MeanGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
MeanGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
);
ops
::
MeanGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
ops
::
MeanGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
plat
::
float16
>
);
paddle/fluid/operators/mean_op.h
浏览文件 @
e398348e
...
...
@@ -55,7 +55,7 @@ class MeanGradKernel : public framework::OpKernel<T> {
IG
->
mutable_data
<
T
>
(
context
.
GetPlace
());
T
ig_size
=
static_cast
<
T
>
(
IG
->
numel
());
Eigen
::
DSizes
<
int
,
1
>
bcast
(
ig_size
);
Eigen
::
DSizes
<
int
,
1
>
bcast
(
static_cast
<
int
>
(
ig_size
)
);
EigenVector
<
T
>::
Flatten
(
*
IG
).
device
(
*
context
.
template
device_context
<
DeviceContext
>().
eigen_device
())
=
...
...
paddle/fluid/operators/mul_op.cu.cc
浏览文件 @
e398348e
...
...
@@ -20,6 +20,7 @@ namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL
(
mul
,
ops
::
MulKernel
<
plat
::
CUDADeviceContext
,
float
>
,
ops
::
MulKernel
<
plat
::
CUDADeviceContext
,
double
>
,
ops
::
MulKernel
<
plat
::
CUDADeviceContext
,
plat
::
float16
>
);
REGISTER_OP_CUDA_KERNEL
(
mul_grad
,
ops
::
MulGradKernel
<
plat
::
CUDADeviceContext
,
float
>
,
ops
::
MulGradKernel
<
plat
::
CUDADeviceContext
,
double
>
);
REGISTER_OP_CUDA_KERNEL
(
mul_grad
,
ops
::
MulGradKernel
<
plat
::
CUDADeviceContext
,
float
>
,
ops
::
MulGradKernel
<
plat
::
CUDADeviceContext
,
double
>
,
ops
::
MulGradKernel
<
plat
::
CUDADeviceContext
,
plat
::
float16
>
);
paddle/fluid/operators/pool_cudnn_op.cu.cc
浏览文件 @
e398348e
...
...
@@ -174,7 +174,8 @@ REGISTER_OP_KERNEL(pool2d, CUDNN, plat::CUDAPlace,
ops
::
PoolCUDNNOpKernel
<
plat
::
float16
>
);
REGISTER_OP_KERNEL
(
pool2d_grad
,
CUDNN
,
plat
::
CUDAPlace
,
ops
::
PoolCUDNNGradOpKernel
<
float
>
,
ops
::
PoolCUDNNGradOpKernel
<
double
>
);
ops
::
PoolCUDNNGradOpKernel
<
double
>
,
ops
::
PoolCUDNNGradOpKernel
<
plat
::
float16
>
);
REGISTER_OP_KERNEL
(
pool3d
,
CUDNN
,
plat
::
CUDAPlace
,
ops
::
PoolCUDNNOpKernel
<
float
>
,
...
...
@@ -182,4 +183,5 @@ REGISTER_OP_KERNEL(pool3d, CUDNN, plat::CUDAPlace,
ops
::
PoolCUDNNOpKernel
<
plat
::
float16
>
);
REGISTER_OP_KERNEL
(
pool3d_grad
,
CUDNN
,
plat
::
CUDAPlace
,
ops
::
PoolCUDNNGradOpKernel
<
float
>
,
ops
::
PoolCUDNNGradOpKernel
<
double
>
);
ops
::
PoolCUDNNGradOpKernel
<
double
>
,
ops
::
PoolCUDNNGradOpKernel
<
plat
::
float16
>
);
paddle/fluid/operators/prelu_op.cc
浏览文件 @
e398348e
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
...
@@ -26,14 +23,40 @@ class PReluOp : public framework::OperatorWithKernel {
:
OperatorWithKernel
(
type
,
inputs
,
outputs
,
attrs
)
{}
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
std
::
string
mode
=
ctx
->
Attrs
().
Get
<
std
::
string
>
(
"mode"
);
auto
x_dim
=
ctx
->
GetInputDim
(
"X"
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"X"
),
"Input(X) should not be null"
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Alpha"
),
"Input(Alpha) should not be null"
);
PADDLE_ENFORCE
(
product
(
ctx
->
GetInputDim
(
"Alpha"
))
==
1
,
"Size of weight Alpha must be one."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Out"
),
"Output(Out) should not be null"
);
ctx
->
SetOutputDim
(
"Out"
,
ctx
->
GetInputDim
(
"X"
));
if
(
mode
==
"all"
)
{
PADDLE_ENFORCE
(
product
(
ctx
->
GetInputDim
(
"Alpha"
))
==
1
,
"For mode 'all', size of weight Alpha must be one."
);
}
else
if
(
mode
==
"channel"
)
{
PADDLE_ENFORCE
(
product
(
ctx
->
GetInputDim
(
"Alpha"
))
==
x_dim
[
1
],
"For channel-wise mode, size of weight Alpha must be "
"equal to the number of channels, should be %d"
,
x_dim
[
1
]);
}
else
if
(
mode
==
"element"
)
{
PADDLE_ENFORCE
(
product
(
ctx
->
GetInputDim
(
"Alpha"
))
==
product
(
x_dim
),
"For element-wise mode, size of weight Alpha must be "
"equal to the number of input, should be %d"
,
product
(
x_dim
));
}
else
{
PADDLE_THROW
(
"Unkown mode %s"
,
mode
);
}
ctx
->
SetOutputDim
(
"Out"
,
x_dim
);
ctx
->
ShareLoD
(
"X"
,
/*->*/
"Out"
);
}
protected:
framework
::
OpKernelType
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
return
framework
::
OpKernelType
(
framework
::
ToDataType
(
ctx
.
Input
<
Tensor
>
(
"X"
)
->
type
()),
platform
::
CPUPlace
());
}
};
class
PReluOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
...
...
@@ -44,9 +67,7 @@ class PReluOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput
(
"Out"
,
"The output tensor of prelu operator."
);
AddComment
(
R"DOC(
PRelu Operator.
The equation is:
$$
f(x) =
\begin{cases}
...
...
@@ -54,11 +75,15 @@ f(x) =
x, \qquad \text{if} \ x >= 0
\end{cases}
$$
The input `X` can carry the LoD (Level of Details) information,
or not. And the output shares the LoD information with input `X`.
There are modes:
all: all elements share same weight
channel: elements in a channel share same weight
element: each element has a weight
)DOC"
);
AddAttr
<
std
::
string
>
(
"mode"
,
"The mode for inputs to share weights."
)
.
SetDefault
(
"all"
);
}
};
...
...
@@ -71,9 +96,23 @@ class PReluGradOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"X"
),
"Input(X) must not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
framework
::
GradVarName
(
"Out"
)),
"Input(Out@GRAD) should not be null"
);
ctx
->
SetOutputDim
(
framework
::
GradVarName
(
"X"
),
ctx
->
GetInputDim
(
"X"
));
ctx
->
SetOutputDim
(
framework
::
GradVarName
(
"Alpha"
),
ctx
->
GetInputDim
(
"Alpha"
));
auto
x_grad_name
=
framework
::
GradVarName
(
"X"
);
auto
alpha_grad_name
=
framework
::
GradVarName
(
"Alpha"
);
if
(
ctx
->
HasOutput
(
x_grad_name
))
{
ctx
->
SetOutputDim
(
x_grad_name
,
ctx
->
GetInputDim
(
"X"
));
}
if
(
ctx
->
HasOutput
(
alpha_grad_name
))
{
ctx
->
SetOutputDim
(
alpha_grad_name
,
ctx
->
GetInputDim
(
"Alpha"
));
}
}
protected:
framework
::
OpKernelType
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
return
framework
::
OpKernelType
(
framework
::
ToDataType
(
ctx
.
Input
<
Tensor
>
(
"X"
)
->
type
()),
platform
::
CPUPlace
());
}
};
...
...
paddle/fluid/operators/prelu_op.cu
已删除
100644 → 0
浏览文件 @
99d3f089
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/prelu_op.h"
REGISTER_OP_CUDA_KERNEL
(
prelu
,
paddle
::
operators
::
PReluKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
);
REGISTER_OP_CUDA_KERNEL
(
prelu_grad
,
paddle
::
operators
::
PReluGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
);
paddle/fluid/operators/prelu_op.h
浏览文件 @
e398348e
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
...
@@ -13,32 +10,16 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/transform.h"
namespace
paddle
{
namespace
operators
{
using
Tensor
=
framework
::
Tensor
;
using
platform
::
Transform
;
template
<
typename
T
>
class
PReluFunctor
{
public:
explicit
PReluFunctor
(
const
T
*
alpha
)
:
alpha_
(
alpha
)
{}
HOSTDEVICE
T
operator
()(
const
T
&
x
)
const
{
if
(
x
>
0
)
return
x
;
else
return
x
*
(
*
alpha_
);
}
private:
const
T
*
alpha_
;
};
template
<
typename
DeviceContext
,
typename
T
>
class
PReluKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
...
...
@@ -50,53 +31,93 @@ class PReluKernel : public framework::OpKernel<T> {
const
T
*
x_ptr
=
x
->
data
<
T
>
();
T
*
o_ptr
=
out
->
mutable_data
<
T
>
(
context
.
GetPlace
());
auto
*
alpha_ptr
=
alpha
->
data
<
T
>
();
const
T
*
alpha_ptr
=
alpha
->
data
<
T
>
();
std
::
string
mode
=
context
.
Attr
<
std
::
string
>
(
"mode"
);
int
numel
=
x
->
numel
();
Transform
<
DeviceContext
>
trans
;
trans
(
context
.
template
device_context
<
DeviceContext
>(),
x_ptr
,
x_ptr
+
numel
,
o_ptr
,
PReluFunctor
<
T
>
(
alpha_ptr
));
}
};
template
<
typename
T
>
class
PReluGradFunctor
{
public:
explicit
PReluGradFunctor
(
const
T
*
alpha
)
:
alpha_
(
alpha
)
{}
HOSTDEVICE
T
operator
()(
const
T
&
out
,
const
T
&
dout
)
const
{
if
(
out
>
0
)
return
dout
;
else
return
dout
*
(
*
alpha_
);
auto
dim
=
x
->
dims
();
int
index
=
0
;
int
i
=
0
;
int
temp
=
0
;
if
(
mode
==
"channel"
)
{
for
(
i
=
0
;
i
<
numel
;
i
++
)
{
temp
=
numel
/
(
dim
[
0
]
*
dim
[
1
]);
index
=
(
i
/
temp
)
%
dim
[
1
];
o_ptr
[
i
]
=
x_ptr
[
i
]
>
0
?
x_ptr
[
i
]
:
alpha_ptr
[
index
]
*
x_ptr
[
i
];
}
}
else
if
(
mode
==
"element"
)
{
for
(
i
=
0
;
i
<
numel
;
i
++
)
{
o_ptr
[
i
]
=
x_ptr
[
i
]
>
0
?
x_ptr
[
i
]
:
alpha_ptr
[
i
]
*
x_ptr
[
i
];
}
}
else
{
for
(
i
=
0
;
i
<
numel
;
i
++
)
{
o_ptr
[
i
]
=
x_ptr
[
i
]
>
0
?
x_ptr
[
i
]
:
alpha_ptr
[
0
]
*
x_ptr
[
i
];
}
}
}
private:
const
T
*
alpha_
;
};
template
<
typename
DeviceContext
,
typename
T
>
class
PReluGradKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
auto
*
x
=
context
.
Input
<
Tensor
>
(
"X"
);
auto
*
dx
=
context
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"X"
));
auto
*
dout
=
context
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"Out"
));
auto
*
dalpha
=
context
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"Alpha"
));
auto
*
out
=
context
.
Input
<
Tensor
>
(
"Out"
);
auto
*
alpha
=
context
.
Input
<
Tensor
>
(
"Alpha"
);
auto
*
alpha_ptr
=
alpha
->
data
<
T
>
();
T
*
dx_ptr
=
dx
->
mutable_data
<
T
>
(
context
.
GetPlace
());
const
T
*
alpha_ptr
=
alpha
->
data
<
T
>
();
const
T
*
x_ptr
=
x
->
data
<
T
>
();
const
T
*
dout_ptr
=
dout
->
data
<
T
>
();
const
T
*
out_ptr
=
out
->
data
<
T
>
();
int
numel
=
dx
->
numel
();
Transform
<
DeviceContext
>
trans
;
trans
(
context
.
template
device_context
<
DeviceContext
>(),
out_ptr
,
out_ptr
+
numel
,
dout_ptr
,
dx_ptr
,
PReluGradFunctor
<
T
>
(
alpha_ptr
));
// TODO(Zhuoyuan): add dalpha upgrade when GPU kernels ready
std
::
string
mode
=
context
.
Attr
<
std
::
string
>
(
"mode"
);
int
numel
=
x
->
numel
();
auto
dim
=
x
->
dims
();
int
index
=
0
;
int
i
=
0
;
int
temp
=
0
;
if
(
dx
)
{
T
*
dx_ptr
=
dx
->
mutable_data
<
T
>
(
context
.
GetPlace
());
if
(
mode
==
"channel"
)
{
for
(
i
=
0
;
i
<
numel
;
i
++
)
{
temp
=
numel
/
(
dim
[
0
]
*
dim
[
1
]);
index
=
(
i
/
temp
)
%
dim
[
1
];
dx_ptr
[
i
]
=
out_ptr
[
i
]
>
0
?
dout_ptr
[
i
]
:
alpha_ptr
[
index
]
*
dout_ptr
[
i
];
}
}
else
if
(
mode
==
"element"
)
{
for
(
i
=
0
;
i
<
numel
;
i
++
)
{
dx_ptr
[
i
]
=
out_ptr
[
i
]
>
0
?
dout_ptr
[
i
]
:
alpha_ptr
[
i
]
*
dout_ptr
[
i
];
}
}
else
{
for
(
i
=
0
;
i
<
numel
;
i
++
)
{
dx_ptr
[
i
]
=
out_ptr
[
i
]
>
0
?
dout_ptr
[
i
]
:
alpha_ptr
[
0
]
*
dout_ptr
[
i
];
}
}
}
index
=
0
;
if
(
dalpha
)
{
T
*
dalpha_ptr
=
dalpha
->
mutable_data
<
T
>
(
context
.
GetPlace
());
if
(
mode
==
"channel"
)
{
for
(
i
=
0
;
i
<
numel
;
i
++
)
{
temp
=
numel
/
(
dim
[
0
]
*
dim
[
1
]);
index
=
(
i
/
temp
)
%
dim
[
1
];
dalpha_ptr
[
index
]
+=
out_ptr
[
i
]
>
0
?
0
:
x_ptr
[
i
]
*
dout_ptr
[
i
];
}
}
else
if
(
mode
==
"element"
)
{
for
(
i
=
0
;
i
<
numel
;
i
++
)
{
dalpha_ptr
[
i
]
+=
out_ptr
[
i
]
>
0
?
0
:
x_ptr
[
i
]
*
dout_ptr
[
i
];
}
}
else
{
for
(
i
=
0
;
i
<
numel
;
i
++
)
{
dalpha_ptr
[
0
]
+=
out_ptr
[
i
]
>
0
?
0
:
x_ptr
[
i
]
*
dout_ptr
[
i
];
}
}
}
// TODO(Guanzhong): add GPU kernels
}
};
...
...
paddle/fluid/operators/scale_op.cu
浏览文件 @
e398348e
...
...
@@ -13,11 +13,15 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/scale_op.h"
#include "paddle/fluid/platform/float16.h"
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_CUDA_KERNEL
(
scale
,
paddle
::
operators
::
ScaleKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
paddle
::
operators
::
ScaleKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
paddle
::
operators
::
ScaleKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int
>
,
paddle
::
operators
::
ScaleKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
);
int64_t
>
,
paddle
::
operators
::
ScaleKernel
<
paddle
::
platform
::
CUDADeviceContext
,
plat
::
float16
>
);
paddle/fluid/operators/scatter_op.h
浏览文件 @
e398348e
...
...
@@ -35,7 +35,7 @@ class ScatterOpKernel : public framework::OpKernel<T> {
auto
*
Out
=
ctx
.
Output
<
Tensor
>
(
"Out"
);
// In place output: Out = X, Out[Ids] += Updates
Out
->
ShareDataWith
(
*
X
);
framework
::
TensorCopySync
(
*
X
,
ctx
.
GetPlace
(),
Out
);
// Apply ScatterUpdate: Out[index] += Updates[:]
ScatterAssign
<
T
>
(
ctx
.
device_context
(),
*
Updates
,
*
Ids
,
Out
);
}
...
...
@@ -53,7 +53,7 @@ class ScatterGradientOpKernel : public framework::OpKernel<T> {
auto
*
dOut
=
ctx
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"Out"
));
// In place gradient: dX = dO
dX
->
ShareDataWith
(
*
dOut
);
framework
::
TensorCopySync
(
*
dOut
,
ctx
.
GetPlace
(),
dX
);
dUpdates
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
// Gradient by Gather: dUpdates += dO[Ids]
CPUGather
<
T
>
(
ctx
.
device_context
(),
*
dOut
,
*
Ids
,
dUpdates
);
...
...
paddle/fluid/operators/softmax_cudnn_op.cu.cc
浏览文件 @
e398348e
...
...
@@ -78,4 +78,5 @@ REGISTER_OP_KERNEL(softmax, CUDNN, plat::CUDAPlace,
ops
::
SoftmaxCUDNNKernel
<
float
>
,
ops
::
SoftmaxCUDNNKernel
<
plat
::
float16
>
);
REGISTER_OP_KERNEL
(
softmax_grad
,
CUDNN
,
plat
::
CUDAPlace
,
ops
::
SoftmaxGradCUDNNKernel
<
float
>
);
ops
::
SoftmaxGradCUDNNKernel
<
float
>
,
ops
::
SoftmaxGradCUDNNKernel
<
plat
::
float16
>
);
paddle/fluid/operators/softmax_op.cu.cc
浏览文件 @
e398348e
...
...
@@ -23,4 +23,5 @@ REGISTER_OP_CUDA_KERNEL(
ops
::
SoftmaxKernel
<
plat
::
CUDADeviceContext
,
plat
::
float16
>
);
REGISTER_OP_CUDA_KERNEL
(
softmax_grad
,
ops
::
SoftmaxGradKernel
<
plat
::
CUDADeviceContext
,
float
>
,
ops
::
SoftmaxGradKernel
<
plat
::
CUDADeviceContext
,
double
>
);
ops
::
SoftmaxGradKernel
<
plat
::
CUDADeviceContext
,
double
>
,
ops
::
SoftmaxGradKernel
<
plat
::
CUDADeviceContext
,
plat
::
float16
>
);
paddle/fluid/operators/sum_op.cu
浏览文件 @
e398348e
...
...
@@ -11,10 +11,13 @@ limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/fluid/operators/sum_op.h"
#include "paddle/fluid/platform/float16.h"
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_CUDA_KERNEL
(
sum
,
ops
::
SumKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
SumKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
ops
::
SumKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int
>
,
ops
::
SumKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
);
ops
::
SumKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
,
ops
::
SumKernel
<
paddle
::
platform
::
CUDADeviceContext
,
plat
::
float16
>
);
paddle/fluid/operators/sum_op.h
浏览文件 @
e398348e
...
...
@@ -46,7 +46,7 @@ class SumKernel : public framework::OpKernel<T> {
if
(
!
in_place
)
{
math
::
SetConstant
<
DeviceContext
,
T
>
constant_functor
;
constant_functor
(
context
.
template
device_context
<
DeviceContext
>(),
out
,
0.0
);
static_cast
<
T
>
(
0
)
);
}
math
::
SelectedRowsAddToTensor
<
DeviceContext
,
T
>
functor
;
...
...
paddle/fluid/operators/top_k_op.cu
浏览文件 @
e398348e
...
...
@@ -11,16 +11,19 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <limits>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/top_k_op.h"
#include "paddle/fluid/platform/assert.h"
#include "paddle/fluid/platform/cuda_device_function.h"
#include "paddle/fluid/platform/float16.h"
namespace
paddle
{
namespace
operators
{
using
Tensor
=
framework
::
Tensor
;
using
paddle
::
platform
::
float16
;
template
<
typename
T
>
struct
Pair
{
...
...
@@ -32,6 +35,11 @@ struct Pair {
id
=
id
;
}
__device__
__forceinline__
void
clear
()
{
v
=
-
INFINITY
;
id
=
-
1
;
}
__device__
__forceinline__
void
operator
=
(
const
Pair
<
T
>&
in
)
{
v
=
in
.
v
;
id
=
in
.
id
;
...
...
@@ -53,6 +61,12 @@ struct Pair {
int64_t
id
;
};
template
<
>
__device__
__forceinline__
void
Pair
<
float16
>::
clear
()
{
v
=
platform
::
raw_uint16_to_float16
(
0x400
);
id
=
-
1
;
}
template
<
typename
T
>
__device__
__forceinline__
void
AddTo
(
Pair
<
T
>
topk
[],
const
Pair
<
T
>&
p
,
int
beam_size
)
{
...
...
@@ -150,7 +164,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
if
(
k
<
MaxLength
-
(
*
beam
))
{
topk
[
k
]
=
topk
[
k
+
*
beam
];
}
else
{
topk
[
k
].
set
(
-
INFINITY
,
-
1
);
topk
[
k
].
clear
(
);
}
}
if
(
!
(
*
is_empty
))
{
...
...
@@ -160,7 +174,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
}
*
max
=
topk
[
MaxLength
-
1
];
if
((
*
max
).
v
==
-
1
)
*
is_empty
=
true
;
if
((
*
max
).
v
==
static_cast
<
T
>
(
-
1
)
)
*
is_empty
=
true
;
*
beam
=
0
;
}
}
...
...
@@ -181,7 +195,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
if
(
k
<
MaxLength
-
*
beam
)
{
topk
[
k
]
=
topk
[
k
+
*
beam
];
}
else
{
topk
[
k
].
set
(
-
INFINITY
,
-
1
);
topk
[
k
].
set
(
std
::
numeric_limits
<
T
>::
min
()
,
-
1
);
}
}
if
(
!
(
*
is_empty
))
{
...
...
@@ -273,7 +287,7 @@ __global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices,
bool
firststep
=
true
;
for
(
int
k
=
0
;
k
<
MaxLength
;
k
++
)
{
topk
[
k
].
set
(
-
INFINITY
,
-
1
);
topk
[
k
].
clear
(
);
}
while
(
k
)
{
ThreadGetTopK
<
T
,
MaxLength
,
BlockSize
>
(
topk
,
&
beam
,
k
,
...
...
@@ -325,5 +339,7 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
}
// namespace operators
}
// namespace paddle
REGISTER_OP_CUDA_KERNEL
(
top_k
,
paddle
::
operators
::
TopkOpCUDAKernel
<
float
>
,
paddle
::
operators
::
TopkOpCUDAKernel
<
double
>
);
REGISTER_OP_CUDA_KERNEL
(
top_k
,
paddle
::
operators
::
TopkOpCUDAKernel
<
float
>
,
paddle
::
operators
::
TopkOpCUDAKernel
<
double
>
,
paddle
::
operators
::
TopkOpCUDAKernel
<
paddle
::
platform
::
float16
>
);
paddle/fluid/operators/uniform_random_op.cu
浏览文件 @
e398348e
...
...
@@ -11,10 +11,14 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <glog/logging.h>
#include <thrust/random.h>
#include <thrust/transform.h>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/transform.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -36,6 +40,11 @@ struct UniformGenerator {
}
};
template
<
typename
T
,
typename
V
>
struct
CastFunctor
{
HOSTDEVICE
V
operator
()(
const
T
&
a
)
{
return
static_cast
<
V
>
(
a
);
}
};
// It seems that Eigen::Tensor::random in GPU will SEGFAULT.
// Use std::random and thrust::random(thrust is a std library in CUDA) to
// implement uniform random.
...
...
@@ -66,18 +75,50 @@ class GPUUniformRandomKernel : public framework::OpKernel<T> {
T
max
=
static_cast
<
T
>
(
context
.
Attr
<
float
>
(
"max"
));
thrust
::
counting_iterator
<
unsigned
int
>
index_sequence_begin
(
0
);
int64_t
size
=
tensor
->
numel
();
thrust
::
transform
(
index_sequence_begin
,
index_sequence_begin
+
size
,
thrust
::
device_ptr
<
T
>
(
data
),
UniformGenerator
<
T
>
(
min
,
max
,
seed
));
if
(
out_var
->
IsType
<
framework
::
LoDTensor
>
()
&&
std
::
type_index
(
typeid
(
T
))
==
std
::
type_index
(
typeid
(
platform
::
float16
)))
{
framework
::
Tensor
master_copy_tensor
;
master_copy_tensor
.
Resize
(
tensor
->
dims
());
float
*
master_copy_tensor_data
=
master_copy_tensor
.
mutable_data
<
float
>
(
context
.
GetPlace
());
thrust
::
transform
(
index_sequence_begin
,
index_sequence_begin
+
size
,
thrust
::
device_ptr
<
float
>
(
master_copy_tensor_data
),
UniformGenerator
<
float
>
(
static_cast
<
float
>
(
min
),
static_cast
<
float
>
(
max
),
seed
));
platform
::
Transform
<
platform
::
CUDADeviceContext
>
trans
;
auto
*
in_begin
=
master_copy_tensor
.
data
<
float
>
();
auto
*
in_end
=
in_begin
+
master_copy_tensor
.
numel
();
auto
*
out_begin
=
tensor
->
mutable_data
<
T
>
(
context
.
GetPlace
());
trans
(
context
.
template
device_context
<
platform
::
CUDADeviceContext
>(),
in_begin
,
in_end
,
out_begin
,
CastFunctor
<
float
,
T
>
());
}
else
{
thrust
::
transform
(
index_sequence_begin
,
index_sequence_begin
+
size
,
thrust
::
device_ptr
<
T
>
(
data
),
UniformGenerator
<
T
>
(
min
,
max
,
seed
));
}
if
(
VLOG_IS_ON
(
5
))
{
framework
::
Tensor
cpu_tensor
;
framework
::
TensorCopySync
(
*
tensor
,
platform
::
CPUPlace
(),
&
cpu_tensor
);
auto
&
dev_ctx
=
*
platform
::
DeviceContextPool
::
Instance
().
Get
(
context
.
GetPlace
());
dev_ctx
.
Wait
();
auto
x
=
framework
::
EigenVector
<
T
>::
Flatten
(
cpu_tensor
);
VLOG
(
5
)
<<
"The Uniform output "
<<
x
;
}
}
};
}
// namespace operators
}
// namespace paddle
REGISTER_OP_CUDA_KERNEL
(
uniform_random
,
paddle
::
operators
::
GPUUniformRandomKernel
<
float
>
,
paddle
::
operators
::
GPUUniformRandomKernel
<
double
>
);
REGISTER_OP_CUDA_KERNEL
(
uniform_random_batch_size_like
,
paddle
::
operators
::
GPUUniformRandomKernel
<
float
>
,
paddle
::
operators
::
GPUUniformRandomKernel
<
double
>
);
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_CUDA_KERNEL
(
uniform_random
,
paddle
::
operators
::
GPUUniformRandomKernel
<
float
>
,
paddle
::
operators
::
GPUUniformRandomKernel
<
double
>
,
paddle
::
operators
::
GPUUniformRandomKernel
<
plat
::
float16
>
);
REGISTER_OP_CUDA_KERNEL
(
uniform_random_batch_size_like
,
paddle
::
operators
::
GPUUniformRandomKernel
<
float
>
,
paddle
::
operators
::
GPUUniformRandomKernel
<
double
>
,
paddle
::
operators
::
GPUUniformRandomKernel
<
plat
::
float16
>
);
paddle/fluid/platform/dynload/mklml.h
浏览文件 @
e398348e
...
...
@@ -60,6 +60,14 @@ extern void* mklml_dso_handle;
__macro(cblas_dgemm_batch); \
__macro(vsAdd); \
__macro(vdAdd); \
__macro(cblas_sgemm_alloc); \
__macro(cblas_sgemm_pack); \
__macro(cblas_sgemm_compute); \
__macro(cblas_sgemm_free); \
__macro(cblas_dgemm_alloc); \
__macro(cblas_dgemm_pack); \
__macro(cblas_dgemm_compute); \
__macro(cblas_dgemm_free); \
__macro(MKL_Set_Num_Threads)
MKLML_ROUTINE_EACH
(
DECLARE_DYNAMIC_LOAD_MKLML_WRAP
);
...
...
paddle/fluid/platform/enforce.h
浏览文件 @
e398348e
...
...
@@ -263,7 +263,8 @@ inline void throw_on_error(T e) {
* PADDLE_ENFORCE_EQ(a, b);
*
* will raise an expression described as follows:
* "enforce a == b failed, 1 != 2" with detailed stack information.
* "Enforce failed. Expected input a == b, but received a(1) != b(2)."
* with detailed stack information.
*
* extra messages is also supported, for example:
* PADDLE_ENFORCE(a, b, "some simple enforce failed between %d numbers", 2)
...
...
@@ -292,9 +293,10 @@ inline void throw_on_error(T e) {
#define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...) \
do { \
if (UNLIKELY(!((__VAL0)__CMP(__VAL1)))) { \
PADDLE_THROW("enforce %s " #__CMP " %s failed, %s " #__INV_CMP \
" %s\n%s", \
#__VAL0, #__VAL1, paddle::string::to_string(__VAL0), \
PADDLE_THROW("Enforce failed. Expected %s " #__CMP \
" %s, but received %s:%s " #__INV_CMP " %s:%s.\n%s", \
#__VAL0, #__VAL1, #__VAL0, \
paddle::string::to_string(__VAL0), #__VAL1, \
paddle::string::to_string(__VAL1), \
paddle::string::Sprintf("" __VA_ARGS__)); \
} \
...
...
paddle/fluid/platform/enforce_test.cc
浏览文件 @
e398348e
...
...
@@ -54,7 +54,9 @@ TEST(ENFORCE_EQ, NO_EXTRA_MSG_FAIL) {
PADDLE_ENFORCE_EQ
(
a
,
1
+
3
);
}
catch
(
paddle
::
platform
::
EnforceNotMet
error
)
{
caught_exception
=
true
;
HasPrefix
(
StringPiece
(
error
.
what
()),
"enforce a == 1 + 3 failed, 2 != 4"
);
HasPrefix
(
StringPiece
(
error
.
what
()),
"Enforce failed. Expected a == 1 + 3, but received a:2 != 1 + 3:4."
);
}
EXPECT_TRUE
(
caught_exception
);
}
...
...
@@ -67,7 +69,8 @@ TEST(ENFORCE_EQ, EXTRA_MSG_FAIL) {
}
catch
(
paddle
::
platform
::
EnforceNotMet
error
)
{
caught_exception
=
true
;
HasPrefix
(
StringPiece
(
error
.
what
()),
"enforce a == 1 + 3 failed, 2 != 4
\n
their size not match"
);
"Enforce failed. Expected a == 1 + 3, but received a:2 != 1 + "
"3:4.
\n
their size not match"
);
}
EXPECT_TRUE
(
caught_exception
);
}
...
...
@@ -84,8 +87,9 @@ TEST(ENFORCE_NE, FAIL) {
PADDLE_ENFORCE_NE
(
1.0
,
1UL
);
}
catch
(
paddle
::
platform
::
EnforceNotMet
error
)
{
caught_exception
=
true
;
EXPECT_TRUE
(
HasPrefix
(
StringPiece
(
error
.
what
()),
"enforce 1.0 != 1UL failed, 1 == 1"
))
EXPECT_TRUE
(
HasPrefix
(
StringPiece
(
error
.
what
()),
"Enforce failed. Expected 1.0 != 1UL, but received 1.0:1 == 1UL:1."
))
<<
error
.
what
()
<<
" does not have expected prefix"
;
}
EXPECT_TRUE
(
caught_exception
);
...
...
@@ -98,8 +102,9 @@ TEST(ENFORCE_GT, FAIL) {
PADDLE_ENFORCE_GT
(
1
,
2UL
);
}
catch
(
paddle
::
platform
::
EnforceNotMet
error
)
{
caught_exception
=
true
;
EXPECT_TRUE
(
HasPrefix
(
StringPiece
(
error
.
what
()),
"enforce 1 > 2UL failed, 1 <= 2"
));
EXPECT_TRUE
(
HasPrefix
(
StringPiece
(
error
.
what
()),
"Enforce failed. Expected 1 > 2UL, but received 1:1 <= 2UL:2."
));
}
EXPECT_TRUE
(
caught_exception
);
}
...
...
@@ -116,8 +121,9 @@ TEST(ENFORCE_GE, FAIL) {
PADDLE_ENFORCE_GE
(
1
,
2UL
);
}
catch
(
paddle
::
platform
::
EnforceNotMet
error
)
{
caught_exception
=
true
;
EXPECT_TRUE
(
HasPrefix
(
StringPiece
(
error
.
what
()),
"enforce 1 >= 2UL failed, 1 < 2"
));
EXPECT_TRUE
(
HasPrefix
(
StringPiece
(
error
.
what
()),
"Enforce failed. Expected 1 >= 2UL, but received 1:1 < 2UL:2."
));
}
EXPECT_TRUE
(
caught_exception
);
}
...
...
@@ -135,8 +141,9 @@ TEST(ENFORCE_LE, FAIL) {
PADDLE_ENFORCE_GT
(
1
,
2UL
);
}
catch
(
paddle
::
platform
::
EnforceNotMet
error
)
{
caught_exception
=
true
;
EXPECT_TRUE
(
HasPrefix
(
StringPiece
(
error
.
what
()),
"enforce 1 > 2UL failed, 1 <= 2"
));
EXPECT_TRUE
(
HasPrefix
(
StringPiece
(
error
.
what
()),
"Enforce failed. Expected 1 > 2UL, but received 1:1 <= 2UL:2."
));
}
EXPECT_TRUE
(
caught_exception
);
}
...
...
@@ -153,7 +160,8 @@ TEST(ENFORCE_LT, FAIL) {
}
catch
(
paddle
::
platform
::
EnforceNotMet
error
)
{
caught_exception
=
true
;
EXPECT_TRUE
(
HasPrefix
(
StringPiece
(
error
.
what
()),
"enforce 1UL < 0.12 failed, 1 >= 0.12"
));
"Enforce failed. Expected 1UL < 0.12, but "
"received 1UL:1 >= 0.12:0.12."
));
}
EXPECT_TRUE
(
caught_exception
);
}
...
...
paddle/fluid/platform/gpu_info.cc
浏览文件 @
e398348e
...
...
@@ -116,7 +116,8 @@ size_t GpuMaxChunkSize() {
size_t
allocating
=
static_cast
<
size_t
>
(
FLAGS_fraction_of_gpu_memory_to_use
*
(
total
-
reserving
));
PADDLE_ENFORCE_LE
(
allocating
,
available
);
PADDLE_ENFORCE_LE
(
allocating
,
available
,
"Insufficient GPU memory to allocation."
);
return
allocating
;
}
...
...
python/CMakeLists.txt
浏览文件 @
e398348e
...
...
@@ -97,10 +97,11 @@ if(APPLE)
if
(
NOT INSTALL_NAME_TOOL_EXECUTABLE
)
message
(
FATAL_ERROR
"install_name_tool not found, please check.
\n
"
)
endif
()
else
(
APPLE
)
endif
()
if
(
LINUX
)
find_program
(
PATCHELF_EXECUTABLE patchelf
)
if
(
NOT PATCHELF_EXECUTABLE
)
message
(
FATAL_ERROR
"patchelf not found, please install it.
\n
"
"For Ubuntu, the command is: apt-get install -y patchelf."
)
endif
()
endif
(
APPLE
)
endif
(
LINUX
)
python/paddle/fluid/__init__.py
浏览文件 @
e398348e
...
...
@@ -122,7 +122,7 @@ def __bootstrap__():
'use_pinned_memory'
,
'check_nan_inf'
,
'benchmark'
,
'warpctc_dir'
,
'eager_delete_scope'
,
'use_mkldnn'
,
'initial_cpu_memory_in_mb'
,
'init_allocated_mem'
,
'free_idle_memory'
,
'paddle_num_threads'
,
'cpu_deterministic'
"dist_threadpool_size"
,
'cpu_deterministic'
]
if
core
.
is_compiled_with_dist
():
read_env_flags
.
append
(
'rpc_deadline'
)
...
...
python/paddle/fluid/initializer.py
浏览文件 @
e398348e
...
...
@@ -17,7 +17,6 @@ from __future__ import print_function
from
.
import
framework
import
numpy
as
np
import
contextlib
from
.framework
import
convert_np_dtype_to_dtype_
from
.core
import
VarDesc
__all__
=
[
...
...
python/paddle/fluid/layers/nn.py
浏览文件 @
e398348e
...
...
@@ -100,6 +100,7 @@ __all__ = [
'log'
,
'crop'
,
'rank_loss'
,
'prelu'
,
'flatten'
,
]
...
...
@@ -5077,7 +5078,7 @@ def random_crop(x, shape, seed=None):
return
out
def
log
(
x
):
def
log
(
x
,
name
=
None
):
"""
Calculates the natural log of the given input tensor, element-wise.
...
...
@@ -5087,6 +5088,8 @@ def log(x):
Args:
x (Variable): Input tensor.
name (str|None, default None): A name for this layer If set None,
the layer will be named automatically.
Returns:
Variable: The natural log of the input tensor computed element-wise.
...
...
@@ -5104,7 +5107,7 @@ def log(x):
return
out
def
relu
(
x
):
def
relu
(
x
,
name
=
None
):
"""
Relu takes one input data (Tensor) and produces one output data (Tensor)
where the rectified linear function, y = max(0, x), is applied to
...
...
@@ -5116,6 +5119,8 @@ def relu(x):
Args:
x (Variable): The input tensor.
name (str|None, default None): A name for this layer If set None,
the layer will be named automatically.
Returns:
Variable: The output tensor with the same shape as input.
...
...
@@ -5352,6 +5357,59 @@ def rank_loss(label, left, right, name=None):
return
out
def
prelu
(
x
,
mode
,
param_attr
=
None
,
name
=
None
):
"""
Equation:
y = \max(0, x) + alpha \min(0, x)
Args:
x (Variable): The input tensor.
param_attr(ParamAttr|None): The parameter attribute for the learnable
weight (alpha).
mode (string): The mode for weight sharing
all: all elements share same weight
channel:elements in a channel share same weight
element:each element has a weight
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
Returns:
Variable: The output tensor with the same shape as input.
Examples:
.. code-block:: python
x = fluid.layers.data(name="x", shape=[10,10], dtype="float32")
mode = 'channel'
output = fluid.layers.prelu(x,mode)
"""
helper
=
LayerHelper
(
'prelu'
,
**
locals
())
if
mode
not
in
[
'all'
,
'channel'
,
'element'
]:
raise
ValueError
(
'mode should be one of all, channel, element.'
)
alpha_shape
=
[
1
]
if
mode
==
'channel'
:
alpha_shape
=
[
1
,
x
.
shape
[
1
],
1
,
1
]
elif
mode
==
'element'
:
alpha_shape
=
x
.
shape
dtype
=
helper
.
input_dtype
(
input_param_name
=
'x'
)
alpha
=
helper
.
create_parameter
(
attr
=
param_attr
,
shape
=
alpha_shape
,
dtype
=
'float32'
,
is_bias
=
False
,
default_initializer
=
Constant
(
1.0
))
out
=
helper
.
create_tmp_variable
(
dtype
)
helper
.
append_op
(
type
=
"prelu"
,
inputs
=
{
"X"
:
x
,
'Alpha'
:
alpha
},
attrs
=
{
"mode"
:
mode
},
outputs
=
{
"Out"
:
out
})
return
out
def
flatten
(
x
,
axis
=
1
,
name
=
None
):
"""
**Flatten layer**
...
...
python/paddle/fluid/tests/unittests/CMakeLists.txt
浏览文件 @
e398348e
...
...
@@ -59,8 +59,8 @@ py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=$
if
(
WITH_DISTRIBUTE
)
py_test_modules
(
test_dist_train MODULES test_dist_train SERIAL
)
set_tests_properties
(
test_listen_and_serv_op PROPERTIES TIMEOUT 20
)
set_tests_properties
(
test_dist_mnist PROPERTIES TIMEOUT
18
0
)
set_tests_properties
(
test_dist_word2vec PROPERTIES TIMEOUT
18
0
)
set_tests_properties
(
test_dist_mnist PROPERTIES TIMEOUT
20
0
)
set_tests_properties
(
test_dist_word2vec PROPERTIES TIMEOUT
20
0
)
endif
()
py_test_modules
(
test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL
)
py_test_modules
(
test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL
)
...
...
python/paddle/fluid/tests/unittests/test_dist_train.py
浏览文件 @
e398348e
...
...
@@ -28,6 +28,12 @@ from paddle.fluid.layers.io import ListenAndServ
from
paddle.fluid.layers.io
import
Recv
from
paddle.fluid.layers.io
import
Send
from
paddle.fluid
import
core
RPC_OP_ROLE_ATTR_NAME
=
op_role_attr_name
=
core
.
op_proto_and_checker_maker
.
kOpRoleAttrName
(
)
RPC_OP_ROLE_ATTR_VALUE
=
core
.
op_proto_and_checker_maker
.
OpRole
.
RPC
class
TestSendOp
(
unittest
.
TestCase
):
def
test_send
(
self
):
...
...
@@ -91,18 +97,29 @@ class TestSendOp(unittest.TestCase):
def
init_client
(
self
,
place
,
port
):
main
=
fluid
.
Program
()
with
fluid
.
program_guard
(
main
):
main
.
global_block
().
append_op
(
type
=
"fetch_barrier"
,
inputs
=
{},
outputs
=
{},
attrs
=
{
"endpoints"
:
[
"127.0.0.1:{0}"
.
format
(
port
)],
RPC_OP_ROLE_ATTR_NAME
:
RPC_OP_ROLE_ATTR_VALUE
})
x
=
layers
.
data
(
shape
=
[
32
,
32
],
dtype
=
'float32'
,
name
=
'X'
,
append_batch_size
=
False
)
fluid
.
initializer
.
Constant
(
value
=
2.3
)(
x
,
main
.
global_block
())
get_var
=
main
.
global_block
().
create_var
(
name
=
"scale_0.tmp_0"
,
# server side var
dtype
=
"float32"
,
persistable
=
False
,
shape
=
[
32
,
32
])
fluid
.
initializer
.
Constant
(
value
=
2.3
)(
get_var
,
main
.
global_block
())
Send
(
"127.0.0.1:%d"
%
port
,
[
x
])
o
=
Recv
(
"127.0.0.1:%d"
%
port
,
[
get_var
])
...
...
python/paddle/fluid/tests/unittests/test_dist_transpiler.py
浏览文件 @
e398348e
...
...
@@ -20,6 +20,7 @@ import unittest
import
paddle.fluid
as
fluid
from
paddle.fluid.transpiler.distribute_transpiler
import
delete_ops
import
traceback
import
collections
class
TranspilerTest
(
unittest
.
TestCase
):
...
...
@@ -56,9 +57,18 @@ class TranspilerTest(unittest.TestCase):
self
.
origin_prog
=
main
.
clone
()
return
main
def
get_trainer
(
self
,
config
=
None
,
sync_mode
=
True
):
t
=
self
.
_transpiler_instance
(
config
,
sync_mode
)
return
t
.
get_trainer_program
()
def
get_trainer
(
self
,
config
=
None
):
src
=
fluid
.
default_startup_program
().
clone
()
t
=
self
.
_transpiler_instance
(
config
)
trainer_main
=
t
.
get_trainer_program
()
trainer_startup
=
fluid
.
default_startup_program
()
assert
(
src
.
num_blocks
==
1
)
assert
(
trainer_startup
.
num_blocks
==
src
.
num_blocks
)
return
trainer_main
,
trainer_startup
def
get_pserver
(
self
,
ep
,
config
=
None
,
sync_mode
=
True
):
t
=
self
.
_transpiler_instance
(
config
,
sync_mode
)
...
...
@@ -94,7 +104,21 @@ class TestBasicModel(TranspilerTest):
pserver
,
startup
=
self
.
get_pserver
(
self
.
pserver1_ep
)
pserver2
,
startup2
=
self
.
get_pserver
(
self
.
pserver2_ep
)
trainer
=
self
.
get_trainer
()
trainer
,
trainer_startup
=
self
.
get_trainer
()
# splited var blocks should be in startup program
self
.
assertTrue
(
"fc_w.block0"
in
trainer_startup
.
global_block
().
vars
)
self
.
assertTrue
(
"fc_w.block1"
in
trainer_startup
.
global_block
().
vars
)
self
.
assertTrue
(
"fc_w"
in
trainer_startup
.
global_block
().
vars
)
self
.
assertTrue
(
"fc_b"
in
trainer_startup
.
global_block
().
vars
)
self
.
assertTrue
(
"fc_w@GRAD"
not
in
trainer_startup
.
global_block
().
vars
)
self
.
assertTrue
(
"fc_b@GRAD"
not
in
trainer_startup
.
global_block
().
vars
)
src
=
[
op
.
type
for
op
in
trainer_startup
.
global_block
().
ops
]
dst
=
[
'fill_constant'
,
'fill_constant'
,
'uniform_random'
,
'recv'
,
'recv'
,
\
'fetch_barrier'
,
'concat'
]
self
.
assertEqual
(
src
,
dst
)
self
.
assertEqual
([
op
.
type
for
op
in
trainer
.
global_block
().
ops
],
[
'mul'
,
'elementwise_add'
,
'elementwise_sub'
,
'square'
,
'mean'
,
...
...
@@ -145,7 +169,7 @@ class TestBasicModelWithLargeBlockSize(TranspilerTest):
pserver
,
startup
=
self
.
get_pserver
(
self
.
pserver1_ep
,
config
)
pserver2
,
startup2
=
self
.
get_pserver
(
self
.
pserver2_ep
,
config
)
trainer
=
self
.
get_trainer
(
config
)
trainer
,
_
=
self
.
get_trainer
(
config
)
self
.
assertEqual
([
op
.
type
for
op
in
trainer
.
global_block
().
ops
],
[
'mul'
,
'elementwise_add'
,
'elementwise_sub'
,
'square'
,
'mean'
,
...
...
@@ -229,7 +253,7 @@ class TestLRDecay(TranspilerTest):
def
transpiler_test_impl
(
self
):
pserver
,
startup
=
self
.
get_pserver
(
self
.
pserver1_ep
)
trainer
=
self
.
get_trainer
()
trainer
,
_
=
self
.
get_trainer
()
self
.
assertEqual
(
len
(
pserver
.
blocks
),
4
)
lr_decay_ops
=
[
op
.
type
for
op
in
pserver
.
blocks
[
1
].
ops
]
...
...
@@ -259,7 +283,7 @@ class TestLRDecayConditional(TranspilerTest):
def
transpiler_test_impl
(
self
):
pserver
,
startup
=
self
.
get_pserver
(
self
.
pserver1_ep
)
trainer
=
self
.
get_trainer
()
trainer
,
_
=
self
.
get_trainer
()
serv_op
=
pserver
.
blocks
[
0
].
ops
[
0
]
sub_blocks
=
[]
...
...
@@ -308,7 +332,7 @@ class TestL2Decay(TranspilerTest):
def
transpiler_test_impl
(
self
):
pserver
,
startup
=
self
.
get_pserver
(
self
.
pserver1_ep
)
trainer
=
self
.
get_trainer
()
trainer
,
_
=
self
.
get_trainer
()
self
.
assertEqual
(
len
(
pserver
.
blocks
),
3
)
self
.
assertEqual
([
op
.
type
for
op
in
pserver
.
blocks
[
1
].
ops
],
...
...
@@ -343,7 +367,7 @@ class TestL2DecayWithPiecewise(TranspilerTest):
def
transpiler_test_impl
(
self
):
pserver
,
startup
=
self
.
get_pserver
(
self
.
pserver1_ep
)
trainer
=
self
.
get_trainer
()
trainer
,
_
=
self
.
get_trainer
()
self
.
assertEqual
(
len
(
pserver
.
blocks
),
9
)
self
.
assertEqual
([
op
.
type
for
op
in
pserver
.
blocks
[
1
].
ops
],
[
...
...
@@ -418,7 +442,7 @@ class TestLocalLookupTable(TestDistLookupTableBase):
self
.
assertEqual
([
op
.
type
for
op
in
pserver1
.
blocks
[
2
].
ops
],
[
"sum"
,
"adam"
,
"scale"
,
"scale"
])
trainer
=
self
.
get_trainer
()
trainer
,
_
=
self
.
get_trainer
()
self
.
assertEqual
(
len
(
trainer
.
blocks
),
1
)
ops
=
[
'lookup_table'
,
'sequence_pool'
,
'lookup_table'
,
'sequence_pool'
,
...
...
@@ -456,7 +480,7 @@ class TestDistLookupTable(TestDistLookupTableBase):
# 5 save table
self
.
assertEqual
([
op
.
type
for
op
in
pserver1
.
blocks
[
5
].
ops
],
[
"save"
])
trainer
=
self
.
get_trainer
()
trainer
,
_
=
self
.
get_trainer
()
self
.
assertEqual
(
len
(
trainer
.
blocks
),
1
)
ops
=
[
'split_ids'
,
'prefetch'
,
'merge_ids'
,
'sequence_pool'
,
'split_ids'
,
...
...
@@ -489,7 +513,7 @@ class TestAsyncLocalLookupTable(TestDistLookupTableBase):
self
.
assertEqual
([
op
.
type
for
op
in
pserver1
.
blocks
[
2
].
ops
],
[
"adam"
,
"scale"
,
"scale"
])
trainer
=
self
.
get_trainer
(
config
)
trainer
,
_
=
self
.
get_trainer
(
config
)
self
.
assertEqual
(
len
(
trainer
.
blocks
),
1
)
ops
=
[
'lookup_table'
,
'sequence_pool'
,
'lookup_table'
,
'sequence_pool'
,
...
...
@@ -528,7 +552,7 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase):
# 5 save table
self
.
assertEqual
([
op
.
type
for
op
in
pserver1
.
blocks
[
5
].
ops
],
[
"save"
])
trainer
=
self
.
get_trainer
(
config
)
trainer
,
_
=
self
.
get_trainer
(
config
)
self
.
assertEqual
(
len
(
trainer
.
blocks
),
1
)
ops
=
[
'split_ids'
,
'prefetch'
,
'merge_ids'
,
'sequence_pool'
,
'split_ids'
,
...
...
python/paddle/fluid/tests/unittests/test_fc_mkldnn_op.py
浏览文件 @
e398348e
...
...
@@ -24,6 +24,7 @@ def fully_connected_naive(input, weights, bias_data=None):
w_h
,
w_c
=
weights
.
shape
x_data
=
np
.
reshape
(
input
,
[
in_n
,
in_c
*
in_h
*
in_w
])
# this transpose should be implemented at C code
w_data
=
np
.
transpose
(
np
.
reshape
(
weights
,
(
w_c
,
in_c
*
in_h
*
in_w
)))
result
=
None
...
...
@@ -45,15 +46,11 @@ class TestFCMKLDNNOp(OpTest):
def
setUp
(
self
):
self
.
op_type
=
"fc"
self
.
use_mkldnn
=
True
self
.
with_bias
=
True
self
.
matrix
=
MatrixGenerate
(
1
,
10
,
15
,
3
,
3
)
self
.
inputs
=
{
'Input'
:
self
.
matrix
.
input
,
'W'
:
self
.
matrix
.
weights
}
self
.
attrs
=
{
'use_mkldnn'
:
self
.
use_mkldnn
,
'with_bias'
:
self
.
with_bias
}
self
.
attrs
=
{
'use_mkldnn'
:
self
.
use_mkldnn
,
}
self
.
outputs
=
{
'Out'
:
fully_connected_naive
(
self
.
matrix
.
input
,
self
.
matrix
.
weights
)
...
...
@@ -87,13 +84,11 @@ class TestFCMKLDNNOp3(TestFCMKLDNNOp):
class
TestFCMKLDNNOp4
(
TestFCMKLDNNOp
):
def
init_op_type
(
self
):
self
.
with_bias
=
False
self
.
matrix
=
MatrixGenerate
(
2
,
32
,
48
,
2
,
2
)
class
TestFCMKLDNNOp4
(
TestFCMKLDNNOp
):
def
init_op_type
(
self
):
self
.
with_bias
=
False
self
.
matrix
=
MatrixGenerate
(
2
,
32
,
1000
,
6
,
6
)
...
...
python/paddle/fluid/tests/unittests/test_fc_op.py
0 → 100644
浏览文件 @
e398348e
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
unittest
import
numpy
as
np
from
op_test
import
OpTest
def
fc_refer
(
matrix
,
with_bias
):
in_n
,
in_c
,
in_h
,
in_w
=
matrix
.
input
.
shape
w_i
,
w_o
=
matrix
.
weights
.
shape
x_data
=
np
.
reshape
(
matrix
.
input
,
[
in_n
,
in_c
*
in_h
*
in_w
])
w_data
=
np
.
reshape
(
matrix
.
weights
,
[
w_i
,
w_o
])
b_data
=
np
.
reshape
(
matrix
.
bias
,
[
1
,
w_o
])
result
=
None
if
with_bias
:
result
=
np
.
dot
(
x_data
,
w_data
)
+
b_data
else
:
result
=
np
.
dot
(
x_data
,
w_data
)
return
result
class
MatrixGenerate
:
def
__init__
(
self
,
mb
,
ic
,
oc
,
h
,
w
):
self
.
input
=
np
.
random
.
random
((
mb
,
ic
,
h
,
w
)).
astype
(
"float32"
)
self
.
weights
=
np
.
random
.
random
((
ic
*
h
*
w
,
oc
)).
astype
(
"float32"
)
self
.
bias
=
np
.
random
.
random
((
1
,
oc
)).
astype
(
"float32"
)
class
TestFCOp
(
OpTest
):
def
setUp
(
self
):
self
.
op_type
=
"fc"
self
.
matrix
=
MatrixGenerate
(
1
,
10
,
15
,
3
,
3
)
self
.
with_bias
=
True
if
self
.
with_bias
:
self
.
inputs
=
{
'Input'
:
self
.
matrix
.
input
,
'W'
:
self
.
matrix
.
weights
,
'Bias'
:
self
.
matrix
.
bias
}
else
:
self
.
inputs
=
{
'Input'
:
self
.
matrix
.
input
,
'W'
:
self
.
matrix
.
weights
}
self
.
attrs
=
{
'use_mkldnn'
:
False
}
self
.
outputs
=
{
'Out'
:
fc_refer
(
self
.
matrix
,
self
.
with_bias
)}
def
test_check_output
(
self
):
self
.
check_output
()
class
TestFCOpBiasBoth
(
TestFCOp
):
def
init_shapes
(
self
,
mb
,
ic
,
oc
,
h
,
w
):
for
with_bias
in
{
True
,
False
}:
self
.
with_bias
=
with_bias
self
.
matrix
=
MatrixGenerate
(
mb
,
ic
,
oc
,
h
,
w
)
class
TestFCOp1
(
TestFCOpBiasBoth
):
def
init_op_type
(
self
):
self
.
init_shapes
(
2
,
8
,
10
,
1
,
1
)
class
TestFCOp2
(
TestFCOpBiasBoth
):
def
init_op_type
(
self
):
self
.
init_shapes
(
4
,
5
,
6
,
2
,
2
)
class
TestFCOp4
(
TestFCOpBiasBoth
):
def
init_op_type
(
self
):
self
.
init_shapes
(
1
,
32
,
64
,
3
,
3
)
if
__name__
==
"__main__"
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_layers.py
浏览文件 @
e398348e
...
...
@@ -21,6 +21,7 @@ import paddle.fluid.nets as nets
from
paddle.fluid.framework
import
Program
,
program_guard
,
default_main_program
from
paddle.fluid.param_attr
import
ParamAttr
import
decorators
from
paddle.fluid.initializer
import
Constant
class
TestBook
(
unittest
.
TestCase
):
...
...
@@ -485,6 +486,20 @@ class TestBook(unittest.TestCase):
self
.
assertIsNotNone
(
out
)
print
(
str
(
program
))
def
test_prelu
(
self
):
program
=
Program
()
with
program_guard
(
program
):
input
=
layers
.
data
(
name
=
"input"
,
shape
=
[
5
,
200
,
100
,
100
],
dtype
=
"float32"
)
mode
=
'channel'
out
=
layers
.
prelu
(
input
,
mode
,
param_attr
=
ParamAttr
(
initializer
=
Constant
(
1.0
)),
name
=
'prelu'
)
self
.
assertIsNotNone
(
out
)
print
(
str
(
program
))
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_prelu_op.py
浏览文件 @
e398348e
...
...
@@ -22,30 +22,58 @@ from op_test import OpTest
class
PReluTest
(
OpTest
):
def
setUp
(
self
):
self
.
op_type
=
"prelu"
x_np
=
np
.
random
.
normal
(
size
=
(
10
,
10
)).
astype
(
"float32"
)
for
pos
,
val
in
np
.
ndenumerate
(
x_np
):
# Since zero point in prelu is not differentiable, avoid randomize
# zero.
while
abs
(
val
)
<
1e-3
:
x_np
[
pos
]
=
np
.
random
.
normal
()
val
=
x_np
[
pos
]
x_np_sign
=
np
.
sign
(
x_np
)
x_np
=
x_np_sign
*
np
.
maximum
(
x_np
,
.
005
)
alpha_np
=
np
.
array
([.
1
],
dtype
=
"float32"
)
self
.
inputs
=
{
'X'
:
x_np
,
'Alpha'
:
alpha_np
}
self
.
initTestCase
()
x_np
=
np
.
random
.
normal
(
size
=
(
3
,
5
,
5
,
10
)).
astype
(
"float32"
)
# Since zero point in prelu is not differentiable, avoid randomize
# zero.
x_np
[
np
.
abs
(
x_np
)
<
0.005
]
=
0.02
if
self
.
attrs
==
{
'mode'
:
"all"
}:
alpha_np
=
np
.
random
.
rand
(
1
).
astype
(
"float32"
)
self
.
inputs
=
{
'X'
:
x_np
,
'Alpha'
:
alpha_np
}
elif
self
.
attrs
==
{
'mode'
:
"channel"
}:
alpha_np
=
np
.
random
.
rand
(
1
,
x_np
.
shape
[
1
],
1
,
1
).
astype
(
"float32"
)
self
.
inputs
=
{
'X'
:
x_np
,
'Alpha'
:
alpha_np
}
else
:
alpha_np
=
np
.
random
.
rand
(
*
x_np
.
shape
).
astype
(
"float32"
)
self
.
inputs
=
{
'X'
:
x_np
,
'Alpha'
:
alpha_np
}
out_np
=
np
.
maximum
(
self
.
inputs
[
'X'
],
0.
)
out_np
=
out_np
+
np
.
minimum
(
self
.
inputs
[
'X'
],
0.
)
*
self
.
inputs
[
'Alpha'
]
assert
out_np
is
not
self
.
inputs
[
'X'
]
self
.
outputs
=
{
'Out'
:
out_np
}
def
initTestCase
(
self
):
self
.
attrs
=
{
'mode'
:
"channel"
}
def
test_check_output
(
self
):
self
.
check_output
()
def
test_check_grad
(
self
):
self
.
check_grad
([
'X'
],
'Out'
)
self
.
check_grad
([
'X'
,
'Alpha'
],
'Out'
)
def
test_check_grad_ignore_x
(
self
):
self
.
check_grad
([
'Alpha'
],
'Out'
,
no_grad_set
=
set
(
'X'
))
def
test_check_grad_ignore_alpha
(
self
):
self
.
check_grad
([
'X'
],
'Out'
,
no_grad_set
=
set
(
'Alpha'
))
class
TestCase1
(
PReluTest
):
def
initTestCase
(
self
):
self
.
attrs
=
{
'mode'
:
"all"
}
class
TestCase2
(
PReluTest
):
def
initTestCase
(
self
):
self
.
attrs
=
{
'mode'
:
"channel"
}
class
TestCase3
(
PReluTest
):
def
initTestCase
(
self
):
self
.
attrs
=
{
'mode'
:
"element"
}
if
__name__
==
"__main__"
:
...
...
python/paddle/fluid/transpiler/distribute_transpiler.py
浏览文件 @
e398348e
...
...
@@ -198,6 +198,9 @@ class DistributeTranspiler(object):
if
program
is
None
:
program
=
default_main_program
()
self
.
origin_program
=
program
self
.
origin_startup_program
=
default_startup_program
().
clone
()
self
.
startup_program
=
default_startup_program
()
self
.
trainer_num
=
trainers
self
.
sync_mode
=
sync_mode
self
.
trainer_id
=
trainer_id
...
...
@@ -208,10 +211,10 @@ class DistributeTranspiler(object):
ps_dispatcher
=
self
.
config
.
split_method
(
self
.
pserver_endpoints
)
self
.
has_distributed_lookup_table
=
self
.
_has_distributed_lookup_table
()
# split and create vars, then put splited vars in dicts for later use.
# s
tep 1: s
plit and create vars, then put splited vars in dicts for later use.
self
.
_init_splited_vars
()
# step
3.1
: insert send op to send gradient vars to parameter servers
# step
2
: insert send op to send gradient vars to parameter servers
ps_dispatcher
.
reset
()
send_vars
=
[]
...
...
@@ -269,7 +272,7 @@ class DistributeTranspiler(object):
RPC_OP_ROLE_ATTR_NAME
:
RPC_OP_ROLE_ATTR_VALUE
})
# step 3
.2
: insert recv op to receive parameters from parameter server
# step 3: insert recv op to receive parameters from parameter server
recv_vars
=
[]
for
_
,
var
in
enumerate
(
send_vars
):
recv_vars
.
append
(
self
.
grad_param_mapping
[
var
])
...
...
@@ -316,6 +319,8 @@ class DistributeTranspiler(object):
outputs
=
{
"Out"
:
[
orig_param
]},
attrs
=
{
"axis"
:
0
})
self
.
_get_trainer_startup_program
(
recv_vars
=
recv_vars
,
eplist
=
eplist
)
if
self
.
has_distributed_lookup_table
:
self
.
_replace_lookup_table_op_with_prefetch
(
program
,
pserver_endpoints
)
...
...
@@ -332,8 +337,78 @@ class DistributeTranspiler(object):
# FIXME(typhoonzero): Also ops like clip_gradient, lrn_decay?
delete_ops
(
self
.
origin_program
.
global_block
(),
self
.
optimize_ops
)
self
.
origin_program
.
__str__
()
return
self
.
origin_program
def
_get_trainer_startup_program
(
self
,
recv_vars
,
eplist
,
startup_program
=
None
):
"""
Get transpiled trainer side startup program.
Args:
startup_program(Program): Startup program.
Returns:
Program: trainer side startup program.
"""
if
startup_program
is
None
:
startup_program
=
self
.
startup_program
# FIXME(gongwb): delete not need ops.
# note that: some parameter is not trainable and those ops can't be deleted.
for
varname
,
splited_var
in
self
.
param_var_mapping
.
iteritems
():
# Get the eplist of recv vars
eps
=
[]
for
var
in
splited_var
:
index
=
[
v
.
name
for
v
in
recv_vars
].
index
(
var
.
name
)
eps
.
append
(
eplist
[
index
])
for
var
in
splited_var
:
if
startup_program
.
global_block
().
has_var
(
var
.
name
):
continue
startup_program
.
global_block
().
create_var
(
name
=
var
.
name
,
persistable
=
False
,
type
=
var
.
type
,
dtype
=
var
.
dtype
,
shape
=
var
.
shape
,
lod_level
=
var
.
lod_level
)
op
=
startup_program
.
global_block
().
append_op
(
type
=
"recv"
,
inputs
=
{},
outputs
=
{
"Out"
:
splited_var
},
attrs
=
{
"epmap"
:
eps
,
RPC_OP_ROLE_ATTR_NAME
:
RPC_OP_ROLE_ATTR_VALUE
})
startup_program
.
global_block
().
append_op
(
type
=
"fetch_barrier"
,
inputs
=
{},
outputs
=
{},
attrs
=
{
"endpoints"
:
self
.
pserver_endpoints
,
RPC_OP_ROLE_ATTR_NAME
:
RPC_OP_ROLE_ATTR_VALUE
})
for
varname
,
splited_var
in
self
.
param_var_mapping
.
iteritems
():
#add concat ops to merge splited parameters received from parameter servers.
if
len
(
splited_var
)
<=
1
:
continue
orig_param
=
startup_program
.
global_block
().
vars
[
varname
]
startup_program
.
global_block
().
append_op
(
type
=
"concat"
,
inputs
=
{
"X"
:
splited_var
},
outputs
=
{
"Out"
:
[
orig_param
]},
attrs
=
{
"axis"
:
0
})
return
startup_program
def
get_pserver_program
(
self
,
endpoint
):
"""
Get parameter server side program.
...
...
@@ -580,14 +655,16 @@ class DistributeTranspiler(object):
new_outputs
=
collections
.
OrderedDict
()
# do not append startup op if var is not on this pserver
op_on_pserver
=
False
for
key
in
op
.
output_names
:
newname
,
_
=
_get_splited_name_and_shape
(
op
.
output
(
key
)[
0
])
if
newname
:
op_on_pserver
=
True
new_outputs
[
key
]
=
created_var_map
[
newname
]
elif
op
.
output
(
key
)[
0
]
in
pserver_vars
:
op_on_pserver
=
True
new_outputs
[
key
]
=
pserver_vars
[
op
.
output
(
key
)[
0
]]
# TODO(gongwb): remove this line.
if
op
.
type
not
in
[
"recv"
,
"fetch_barrier"
,
"concat"
]:
for
key
in
op
.
output_names
:
newname
,
_
=
_get_splited_name_and_shape
(
op
.
output
(
key
)[
0
])
if
newname
:
op_on_pserver
=
True
new_outputs
[
key
]
=
created_var_map
[
newname
]
elif
op
.
output
(
key
)[
0
]
in
pserver_vars
:
op_on_pserver
=
True
new_outputs
[
key
]
=
pserver_vars
[
op
.
output
(
key
)[
0
]]
if
op_on_pserver
:
# most startup program ops have no inputs
...
...
@@ -1026,7 +1103,6 @@ class DistributeTranspiler(object):
var_mapping
[
varname
]
=
\
[
program
.
global_block
().
var
(
orig_var
.
name
)]
continue
var_mapping
[
varname
]
=
[]
orig_shape
=
orig_var
.
shape
orig_dim1_flatten
=
1
...
...
tools/manylinux1/Dockerfile.x64
浏览文件 @
e398348e
...
...
@@ -13,7 +13,7 @@ ENV PATH /opt/rh/devtoolset-2/root/usr/bin:$PATH
ENV LD_LIBRARY_PATH /opt/rh/devtoolset-2/root/usr/lib64:/opt/rh/devtoolset-2/root/usr/lib:/usr/local/lib64:/usr/local/lib:${LD_LIBRARY_PATH}
ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig
RUN yum install -y sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz graphviz
RUN yum install -y sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz
freetype-devel libpng-devel
graphviz
COPY build_scripts /build_scripts
RUN bash build_scripts/build.sh && \
bash build_scripts/install_nccl2.sh && rm -r build_scripts
...
...
tools/manylinux1/build_scripts/build.sh
浏览文件 @
e398348e
...
...
@@ -105,7 +105,7 @@ curl-config --features
rm
-rf
/usr/local/ssl
# Install patchelf (latest with unreleased bug fixes)
curl
-sLO
http
s
://nipy.bic.berkeley.edu/manylinux/patchelf-0.9njs2.tar.gz
curl
-sLO
http://nipy.bic.berkeley.edu/manylinux/patchelf-0.9njs2.tar.gz
check_sha256sum patchelf-0.9njs2.tar.gz
$PATCHELF_HASH
tar
-xzf
patchelf-0.9njs2.tar.gz
(
cd
patchelf-0.9njs2
&&
./configure
&&
make
&&
make
install
)
...
...
tools/manylinux1/build_scripts/install_nccl2.sh
浏览文件 @
e398348e
...
...
@@ -21,5 +21,5 @@ for sub_deb in $DEBS; do
ar x
$sub_deb
&&
tar
xf data.tar.xz
done
mv
-f
usr/include/nccl.h /usr/local/include/
mv
-f
usr/lib/libnccl
*
/usr/local/lib/
mv
-f
usr/lib/
x86_64-linux-gnu/
libnccl
*
/usr/local/lib/
rm
-rf
$DIR
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录