Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
cf1944af
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
cf1944af
编写于
11月 02, 2018
作者:
S
sneaxiy
浏览文件
操作
浏览文件
下载
差异文件
test=develop
上级
2494ca83
2eaa291e
变更
65
显示空白变更内容
内联
并排
Showing
65 changed file
with
1452 addition
and
608 deletion
+1452
-608
CMakeLists.txt
CMakeLists.txt
+0
-2
README.md
README.md
+11
-11
paddle/CMakeLists.txt
paddle/CMakeLists.txt
+1
-0
paddle/fluid/API.spec
paddle/fluid/API.spec
+3
-1
paddle/fluid/CMakeLists.txt
paddle/fluid/CMakeLists.txt
+3
-5
paddle/fluid/framework/details/CMakeLists.txt
paddle/fluid/framework/details/CMakeLists.txt
+1
-0
paddle/fluid/framework/details/all_reduce_op_handle.cc
paddle/fluid/framework/details/all_reduce_op_handle.cc
+3
-3
paddle/fluid/framework/details/broadcast_op_handle.h
paddle/fluid/framework/details/broadcast_op_handle.h
+2
-1
paddle/fluid/framework/details/broadcast_op_handle_test.cc
paddle/fluid/framework/details/broadcast_op_handle_test.cc
+1
-221
paddle/fluid/framework/details/broadcast_op_handle_test.h
paddle/fluid/framework/details/broadcast_op_handle_test.h
+271
-0
paddle/fluid/framework/details/computation_op_handle.cc
paddle/fluid/framework/details/computation_op_handle.cc
+1
-1
paddle/fluid/framework/details/data_balance_op_handle.cc
paddle/fluid/framework/details/data_balance_op_handle.cc
+3
-3
paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
...uid/framework/details/fast_threaded_ssa_graph_executor.cc
+8
-8
paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
...luid/framework/details/fast_threaded_ssa_graph_executor.h
+2
-1
paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
...fluid/framework/details/fused_broadcast_op_handle_test.cc
+165
-0
paddle/fluid/framework/details/gather_op_handle.cc
paddle/fluid/framework/details/gather_op_handle.cc
+2
-2
paddle/fluid/framework/details/op_handle_base.cc
paddle/fluid/framework/details/op_handle_base.cc
+1
-1
paddle/fluid/framework/details/reduce_op_handle.cc
paddle/fluid/framework/details/reduce_op_handle.cc
+1
-1
paddle/fluid/framework/details/reduce_op_handle.h
paddle/fluid/framework/details/reduce_op_handle.h
+2
-1
paddle/fluid/framework/details/rpc_op_handle.cc
paddle/fluid/framework/details/rpc_op_handle.cc
+1
-1
paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
+4
-4
paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
...le/fluid/framework/details/threaded_ssa_graph_executor.cc
+8
-9
paddle/fluid/framework/details/threaded_ssa_graph_executor.h
paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+1
-1
paddle/fluid/framework/ir/graph_pattern_detector.cc
paddle/fluid/framework/ir/graph_pattern_detector.cc
+17
-4
paddle/fluid/framework/lod_tensor.cc
paddle/fluid/framework/lod_tensor.cc
+1
-1
paddle/fluid/framework/lod_tensor_array.h
paddle/fluid/framework/lod_tensor_array.h
+0
-74
paddle/fluid/framework/operator.cc
paddle/fluid/framework/operator.cc
+16
-15
paddle/fluid/framework/operator.h
paddle/fluid/framework/operator.h
+1
-0
paddle/fluid/framework/parallel_executor.cc
paddle/fluid/framework/parallel_executor.cc
+2
-4
paddle/fluid/framework/tensor_test.cc
paddle/fluid/framework/tensor_test.cc
+13
-0
paddle/fluid/inference/CMakeLists.txt
paddle/fluid/inference/CMakeLists.txt
+3
-0
paddle/fluid/inference/analysis/CMakeLists.txt
paddle/fluid/inference/analysis/CMakeLists.txt
+11
-16
paddle/fluid/inference/api/CMakeLists.txt
paddle/fluid/inference/api/CMakeLists.txt
+10
-34
paddle/fluid/inference/api/api_impl_tester.cc
paddle/fluid/inference/api/api_impl_tester.cc
+8
-6
paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc
...luid/inference/api/api_tensorrt_subgraph_engine_tester.cc
+2
-2
paddle/fluid/inference/api/demo_ci/run.sh
paddle/fluid/inference/api/demo_ci/run.sh
+1
-1
paddle/fluid/inference/test.cmake
paddle/fluid/inference/test.cmake
+31
-0
paddle/fluid/inference/tests/api/CMakeLists.txt
paddle/fluid/inference/tests/api/CMakeLists.txt
+0
-14
paddle/fluid/operators/add_position_encoding_op.cc
paddle/fluid/operators/add_position_encoding_op.cc
+97
-0
paddle/fluid/operators/add_position_encoding_op.h
paddle/fluid/operators/add_position_encoding_op.h
+105
-0
paddle/fluid/operators/gather_op.cc
paddle/fluid/operators/gather_op.cc
+4
-2
paddle/fluid/operators/gather_op.cu
paddle/fluid/operators/gather_op.cu
+8
-2
paddle/fluid/operators/math/sequence_pooling.cc
paddle/fluid/operators/math/sequence_pooling.cc
+44
-4
paddle/fluid/operators/math/sequence_pooling.cu
paddle/fluid/operators/math/sequence_pooling.cu
+1
-1
paddle/fluid/operators/math/sequence_pooling.h
paddle/fluid/operators/math/sequence_pooling.h
+1
-1
paddle/fluid/operators/sequence_pool_op.cc
paddle/fluid/operators/sequence_pool_op.cc
+1
-0
paddle/fluid/operators/sequence_pool_op.h
paddle/fluid/operators/sequence_pool_op.h
+11
-6
paddle/fluid/operators/sum_op.cc
paddle/fluid/operators/sum_op.cc
+9
-5
paddle/fluid/platform/device_context.cc
paddle/fluid/platform/device_context.cc
+17
-19
paddle/fluid/platform/device_context.h
paddle/fluid/platform/device_context.h
+3
-4
paddle/scripts/paddle_build.sh
paddle/scripts/paddle_build.sh
+3
-7
python/paddle/fluid/layers/nn.py
python/paddle/fluid/layers/nn.py
+105
-4
python/paddle/fluid/metrics.py
python/paddle/fluid/metrics.py
+1
-1
python/paddle/fluid/tests/CMakeLists.txt
python/paddle/fluid/tests/CMakeLists.txt
+0
-2
python/paddle/fluid/tests/book/high-level-api/image_classification/CMakeLists.txt
...s/book/high-level-api/image_classification/CMakeLists.txt
+16
-4
python/paddle/fluid/tests/unittests/CMakeLists.txt
python/paddle/fluid/tests/unittests/CMakeLists.txt
+8
-1
python/paddle/fluid/tests/unittests/dist_mnist.py
python/paddle/fluid/tests/unittests/dist_mnist.py
+4
-2
python/paddle/fluid/tests/unittests/test_add_position_encoding_op.py
...le/fluid/tests/unittests/test_add_position_encoding_op.py
+134
-0
python/paddle/fluid/tests/unittests/test_dist_base.py
python/paddle/fluid/tests/unittests/test_dist_base.py
+26
-41
python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
+3
-2
python/paddle/fluid/tests/unittests/test_dist_transpiler.py
python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+118
-46
python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
...n/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
+61
-4
python/paddle/fluid/tests/unittests/test_seq_pool.py
python/paddle/fluid/tests/unittests/test_seq_pool.py
+14
-0
python/paddle/fluid/transpiler/distribute_transpiler.py
python/paddle/fluid/transpiler/distribute_transpiler.py
+19
-2
python/paddle/fluid/transpiler/inference_transpiler.py
python/paddle/fluid/transpiler/inference_transpiler.py
+28
-0
未找到文件。
CMakeLists.txt
浏览文件 @
cf1944af
...
...
@@ -62,13 +62,11 @@ option(WITH_DISTRIBUTE "Compile with distributed support" OFF)
option
(
USE_EIGEN_FOR_BLAS
"Use matrix multiplication in Eigen"
OFF
)
option
(
EIGEN_USE_THREADS
"Compile with multi-threaded Eigen"
OFF
)
option
(
WITH_ARM_FP16
"Use half precision support on armv8.2-a cpu"
OFF
)
option
(
WITH_FAST_BUNDLE_TEST
"Bundle tests that can be run in a single process together to reduce launch overhead"
OFF
)
option
(
WITH_CONTRIB
"Compile the third-party contributation"
OFF
)
option
(
REPLACE_ENFORCE_GLOG
"Replace PADDLE_ENFORCE with glog/CHECK for better debug."
OFF
)
option
(
WITH_ANAKIN
"Compile with Anakin library"
OFF
)
option
(
WITH_GRPC
"Use grpc as the default rpc framework"
${
WITH_DISTRIBUTE
}
)
option
(
WITH_BRPC_RDMA
"Use brpc rdma as the rpc protocal"
OFF
)
option
(
WITH_INFERENCE
"Compile fluid inference library"
ON
)
option
(
ON_INFER
"Turn on inference optimization."
OFF
)
option
(
WITH_INFERENCE_API_TEST
"Test fluid inference high-level api interface"
OFF
)
option
(
WITH_SYSTEM_BLAS
"Use system blas library"
OFF
)
...
...
README.md
浏览文件 @
cf1944af
...
...
@@ -2,8 +2,8 @@
[

](https://travis-ci.org/PaddlePaddle/Paddle)
[

](http://paddlepaddle.org/documentation/docs/en/1.
0
/getstarted/index_en.html)
[

](http://paddlepaddle.org/documentation/docs/zh/1.
0
/beginners_guide/index.html)
[

](http://paddlepaddle.org/documentation/docs/en/1.
1
/getstarted/index_en.html)
[

](http://paddlepaddle.org/documentation/docs/zh/1.
1
/beginners_guide/index.html)
[

](https://github.com/PaddlePaddle/Paddle/releases)
[

](LICENSE)
...
...
@@ -19,7 +19,7 @@ Our vision is to enable deep learning for everyone via PaddlePaddle.
Please refer to our
[
release announcement
](
https://github.com/PaddlePaddle/Paddle/releases
)
to track the latest feature of PaddlePaddle.
### Latest PaddlePaddle Release: [Fluid 1.
0.1](https://github.com/PaddlePaddle/Paddle/tree/release/1.0.0
)
### Latest PaddlePaddle Release: [Fluid 1.
1.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.1
)
### Install Latest Stable Release:
```
# Linux CPU
...
...
@@ -27,9 +27,9 @@ pip install paddlepaddle
# Linux GPU cuda9cudnn7
pip install paddlepaddle-gpu
# Linux GPU cuda8cudnn7
pip install paddlepaddle-gpu==1.
0.1
.post87
pip install paddlepaddle-gpu==1.
1.0
.post87
# Linux GPU cuda8cudnn5
pip install paddlepaddle-gpu==1.
0.1
.post85
pip install paddlepaddle-gpu==1.
1.0
.post85
# For installation on other platform, refer to http://paddlepaddle.org/
```
...
...
@@ -76,26 +76,26 @@ pip install paddlepaddle-gpu==1.0.1.post85
## Installation
It is recommended to read
[
this doc
](
http://paddlepaddle.org/documentation/docs/zh/1.
0
/beginners_guide/index.html
)
on our website.
It is recommended to read
[
this doc
](
http://paddlepaddle.org/documentation/docs/zh/1.
1
/beginners_guide/index.html
)
on our website.
## Documentation
We provide
[
English
](
http://paddlepaddle.org/documentation/docs/en/1.
0.0
/getstarted/index_en.html
)
and
[
Chinese
](
http://paddlepaddle.org/documentation/docs/zh/1.
0
/beginners_guide/index.html
)
documentation.
We provide
[
English
](
http://paddlepaddle.org/documentation/docs/en/1.
1
/getstarted/index_en.html
)
and
[
Chinese
](
http://paddlepaddle.org/documentation/docs/zh/1.
1
/beginners_guide/index.html
)
documentation.
-
[
Deep Learning 101
](
https://github.com/PaddlePaddle/book
)
You might want to start from this online interactive book that can run in a Jupyter Notebook.
-
[
Distributed Training
](
http://paddlepaddle.org/documentation/docs/zh/1.
0
/user_guides/howto/training/cluster_howto.html
)
-
[
Distributed Training
](
http://paddlepaddle.org/documentation/docs/zh/1.
1
/user_guides/howto/training/cluster_howto.html
)
You can run distributed training jobs on MPI clusters.
-
[
Python API
](
http://paddlepaddle.org/documentation/api/zh/1.
0
/fluid.html
)
-
[
Python API
](
http://paddlepaddle.org/documentation/api/zh/1.
1
/fluid.html
)
Our new API enables much shorter programs.
-
[
How to Contribute
](
http://paddlepaddle.org/documentation/docs/zh/1.
0
/advanced_usage/development/contribute_to_paddle.html
)
-
[
How to Contribute
](
http://paddlepaddle.org/documentation/docs/zh/1.
1
/advanced_usage/development/contribute_to_paddle.html
)
We appreciate your contributions!
...
...
paddle/CMakeLists.txt
浏览文件 @
cf1944af
...
...
@@ -24,6 +24,7 @@ if(NOT WITH_FLUID_ONLY)
endif
()
add_subdirectory
(
testing
)
set
(
PYTHON_TESTS_DIR
${
PADDLE_BINARY_DIR
}
/python/paddle/fluid/tests CACHE INTERNAL
"python tests directory"
)
if
(
NOT MOBILE_INFERENCE AND NOT RPI AND NOT WITH_C_API
)
add_subdirectory
(
fluid
)
endif
()
paddle/fluid/API.spec
浏览文件 @
cf1944af
...
...
@@ -64,7 +64,7 @@ paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', '
paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None, None))
paddle.fluid.layers.conv2d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None))
paddle.fluid.layers.conv3d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None))
paddle.fluid.layers.sequence_pool ArgSpec(args=['input', 'pool_type'
], varargs=None, keywords=None, defaults=None
)
paddle.fluid.layers.sequence_pool ArgSpec(args=['input', 'pool_type'
, 'is_test'], varargs=None, keywords=None, defaults=(False,)
)
paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(False, None))
paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None))
paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None))
...
...
@@ -177,6 +177,8 @@ paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, k
paddle.fluid.layers.sequence_reverse ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.affine_channel ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None))
paddle.fluid.layers.hash ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None))
paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None))
paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
...
...
paddle/fluid/CMakeLists.txt
浏览文件 @
cf1944af
...
...
@@ -9,8 +9,6 @@ add_subdirectory(pybind)
add_subdirectory
(
recordio
)
endif
(
NOT WIN32
)
if
(
WITH_INFERENCE
)
# NOTE: please add subdirectory inference at last.
add_subdirectory
(
inference
)
add_subdirectory
(
train
)
endif
()
# NOTE: please add subdirectory inference at last.
add_subdirectory
(
inference
)
add_subdirectory
(
train
)
paddle/fluid/framework/details/CMakeLists.txt
浏览文件 @
cf1944af
...
...
@@ -58,6 +58,7 @@ cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_execu
# device_context reduce_op_handle )
cc_library
(
fast_threaded_ssa_graph_executor SRCS fast_threaded_ssa_graph_executor.cc
DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context
)
cc_test
(
fused_broadcast_op_test SRCS fused_broadcast_op_handle_test.cc DEPS fused_broadcast_op_handle
)
cc_library
(
build_strategy SRCS build_strategy.cc DEPS
graph_viz_pass multi_devices_graph_pass
...
...
paddle/fluid/framework/details/all_reduce_op_handle.cc
浏览文件 @
cf1944af
...
...
@@ -34,7 +34,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
nccl_ctxs_
(
ctxs
)
{
if
(
nccl_ctxs_
)
{
for
(
auto
&
p
:
places_
)
{
this
->
dev_ctxes_
[
p
]
=
nccl_ctxs_
->
DevCtx
(
p
);
this
->
SetDeviceContext
(
p
,
nccl_ctxs_
->
DevCtx
(
p
)
);
}
}
}
...
...
@@ -46,7 +46,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
#endif
void
AllReduceOpHandle
::
RunImpl
()
{
platform
::
RecordEvent
record_event
(
Name
(),
dev_ctxes_
.
begin
()
->
second
);
platform
::
RecordEvent
record_event
(
Name
(),
dev_ctxes_
.
c
begin
()
->
second
);
if
(
NoDummyInputSize
()
==
1
)
{
return
;
// No need to all reduce when GPU count = 1;
...
...
@@ -127,7 +127,7 @@ void AllReduceOpHandle::RunImpl() {
*
local_scopes_
[
i
]
->
FindVar
(
kLocalExecScopeName
)
->
Get
<
Scope
*>
();
auto
&
p
=
places_
[
i
];
auto
*
var
=
scope
.
FindVar
(
out_var_handles
[
i
]
->
name_
);
auto
*
dev_ctx
=
dev_ctxes_
[
p
]
;
auto
*
dev_ctx
=
dev_ctxes_
.
at
(
p
)
;
RunAndRecordEvent
(
p
,
[
&
trg
,
var
,
dev_ctx
,
p
]
{
auto
&
tensor_gpu
=
*
var
->
GetMutable
<
framework
::
LoDTensor
>
();
...
...
paddle/fluid/framework/details/broadcast_op_handle.h
浏览文件 @
cf1944af
...
...
@@ -44,7 +44,8 @@ struct BroadcastOpHandle : public OpHandleBase {
nccl_ctxs_
(
nccl_ctxs
)
{
if
(
nccl_ctxs_
)
{
for
(
auto
&
p_ctx
:
nccl_ctxs_
->
contexts_
)
{
dev_ctxes_
[
platform
::
CUDAPlace
(
p_ctx
.
first
)]
=
p_ctx
.
second
.
ctx_
.
get
();
this
->
SetDeviceContext
(
platform
::
CUDAPlace
(
p_ctx
.
first
),
p_ctx
.
second
.
ctx_
.
get
());
}
}
}
...
...
paddle/fluid/framework/details/broadcast_op_handle_test.cc
浏览文件 @
cf1944af
...
...
@@ -12,232 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/broadcast_op_handle.h"
#include "gtest/gtest.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/framework/details/broadcast_op_handle_test.h"
namespace
paddle
{
namespace
framework
{
namespace
details
{
namespace
f
=
paddle
::
framework
;
namespace
p
=
paddle
::
platform
;
// test data amount
const
f
::
DDim
kDims
=
{
20
,
20
};
struct
TestBroadcastOpHandle
{
std
::
vector
<
std
::
unique_ptr
<
p
::
DeviceContext
>>
ctxs_
;
std
::
vector
<
Scope
*>
local_scopes_
;
std
::
vector
<
Scope
*>
param_scopes_
;
Scope
g_scope_
;
std
::
unique_ptr
<
OpHandleBase
>
op_handle_
;
std
::
vector
<
std
::
unique_ptr
<
VarHandleBase
>>
vars_
;
std
::
vector
<
p
::
Place
>
gpu_list_
;
bool
use_gpu_
;
#ifdef PADDLE_WITH_CUDA
std
::
unique_ptr
<
platform
::
NCCLContextMap
>
nccl_ctxs_
;
#endif
void
WaitAll
()
{
for
(
size_t
j
=
0
;
j
<
ctxs_
.
size
();
++
j
)
{
ctxs_
[
j
]
->
Wait
();
}
#ifdef PADDLE_WITH_CUDA
if
(
nccl_ctxs_
)
{
nccl_ctxs_
->
WaitAll
();
}
#endif
}
void
InitCtxOnGpu
(
bool
use_gpu
)
{
use_gpu_
=
use_gpu
;
if
(
use_gpu_
)
{
#ifdef PADDLE_WITH_CUDA
int
count
=
p
::
GetCUDADeviceCount
();
if
(
count
<=
1
)
{
LOG
(
WARNING
)
<<
"Cannot test multi-gpu Broadcast, because the CUDA "
"device count is "
<<
count
;
exit
(
0
);
}
for
(
int
i
=
0
;
i
<
count
;
++
i
)
{
auto
p
=
p
::
CUDAPlace
(
i
);
gpu_list_
.
push_back
(
p
);
ctxs_
.
emplace_back
(
new
p
::
CUDADeviceContext
(
p
));
}
nccl_ctxs_
.
reset
(
new
platform
::
NCCLContextMap
(
gpu_list_
));
#else
PADDLE_THROW
(
"CUDA is not support."
);
#endif
}
else
{
int
count
=
8
;
for
(
int
i
=
0
;
i
<
count
;
++
i
)
{
auto
p
=
p
::
CPUPlace
();
gpu_list_
.
push_back
(
p
);
ctxs_
.
emplace_back
(
new
p
::
CPUDeviceContext
(
p
));
}
#ifdef PADDLE_WITH_CUDA
nccl_ctxs_
.
reset
(
nullptr
);
#endif
}
}
void
InitBroadcastOp
(
size_t
input_scope_idx
)
{
for
(
size_t
j
=
0
;
j
<
gpu_list_
.
size
();
++
j
)
{
local_scopes_
.
push_back
(
&
(
g_scope_
.
NewScope
()));
Scope
&
local_scope
=
local_scopes_
.
back
()
->
NewScope
();
*
local_scopes_
.
back
()
->
Var
(
details
::
kLocalExecScopeName
)
->
GetMutable
<
Scope
*>
()
=
&
local_scope
;
local_scope
.
Var
(
"out"
);
param_scopes_
.
emplace_back
(
&
local_scope
);
}
param_scopes_
[
input_scope_idx
]
->
Var
(
"input"
);
std
::
unique_ptr
<
ir
::
Node
>
n
=
ir
::
CreateNodeForTest
(
"node0"
,
ir
::
Node
::
Type
::
kOperation
);
if
(
use_gpu_
)
{
#ifdef PADDLE_WITH_CUDA
op_handle_
.
reset
(
new
BroadcastOpHandle
(
n
.
get
(),
local_scopes_
,
gpu_list_
,
nccl_ctxs_
.
get
()));
#else
PADDLE_THROW
(
"CUDA is not support."
);
#endif
}
else
{
#ifdef PADDLE_WITH_CUDA
op_handle_
.
reset
(
new
BroadcastOpHandle
(
n
.
get
(),
local_scopes_
,
gpu_list_
,
nccl_ctxs_
.
get
()));
#else
op_handle_
.
reset
(
new
BroadcastOpHandle
(
n
.
get
(),
local_scopes_
,
gpu_list_
));
#endif
}
std
::
unique_ptr
<
ir
::
Node
>
v
=
ir
::
CreateNodeForTest
(
"node1"
,
ir
::
Node
::
Type
::
kVariable
);
auto
*
in_var_handle
=
new
VarHandle
(
v
.
get
(),
1
,
input_scope_idx
,
"input"
,
gpu_list_
[
input_scope_idx
]);
vars_
.
emplace_back
(
in_var_handle
);
op_handle_
->
AddInput
(
in_var_handle
);
// add dummy var
std
::
unique_ptr
<
ir
::
Node
>
v2
=
ir
::
CreateNodeForTest
(
"node2"
,
ir
::
Node
::
Type
::
kVariable
);
vars_
.
emplace_back
(
new
DummyVarHandle
(
v2
.
get
()));
DummyVarHandle
*
dummy_var_handle
=
static_cast
<
DummyVarHandle
*>
(
vars_
.
back
().
get
());
dummy_var_handle
->
ClearGeneratedOp
();
op_handle_
->
AddInput
(
dummy_var_handle
);
for
(
size_t
j
=
0
;
j
<
gpu_list_
.
size
();
++
j
)
{
if
(
!
use_gpu_
)
{
op_handle_
->
SetDeviceContext
(
gpu_list_
[
j
],
ctxs_
[
j
].
get
());
}
std
::
unique_ptr
<
ir
::
Node
>
v3
=
ir
::
CreateNodeForTest
(
"node3"
,
ir
::
Node
::
Type
::
kVariable
);
VarHandle
*
out_var_handle
=
new
VarHandle
(
v3
.
get
(),
2
,
j
,
"out"
,
gpu_list_
[
j
]);
vars_
.
emplace_back
(
out_var_handle
);
op_handle_
->
AddOutput
(
out_var_handle
);
}
// add dummy var
std
::
unique_ptr
<
ir
::
Node
>
v4
=
ir
::
CreateNodeForTest
(
"node4"
,
ir
::
Node
::
Type
::
kVariable
);
vars_
.
emplace_back
(
new
DummyVarHandle
(
v4
.
get
()));
DummyVarHandle
*
out_dummy_var_handle
=
static_cast
<
DummyVarHandle
*>
(
vars_
.
back
().
get
());
out_dummy_var_handle
->
ClearGeneratedOp
();
op_handle_
->
AddOutput
(
out_dummy_var_handle
);
}
void
TestBroadcastLodTensor
(
size_t
input_scope_idx
)
{
auto
in_var
=
param_scopes_
[
input_scope_idx
]
->
FindVar
(
"input"
);
PADDLE_ENFORCE_NOT_NULL
(
in_var
);
auto
in_lod_tensor
=
in_var
->
GetMutable
<
f
::
LoDTensor
>
();
in_lod_tensor
->
mutable_data
<
float
>
(
kDims
,
gpu_list_
[
input_scope_idx
]);
std
::
vector
<
float
>
send_vector
(
static_cast
<
size_t
>
(
f
::
product
(
kDims
)));
for
(
size_t
k
=
0
;
k
<
send_vector
.
size
();
++
k
)
{
send_vector
[
k
]
=
k
;
}
f
::
LoD
lod
{{
0
,
10
,
20
}};
paddle
::
framework
::
TensorFromVector
<
float
>
(
send_vector
,
*
(
ctxs_
[
input_scope_idx
]),
in_lod_tensor
);
in_lod_tensor
->
set_lod
(
lod
);
in_lod_tensor
->
Resize
(
kDims
);
op_handle_
->
Run
(
false
);
WaitAll
();
p
::
CPUPlace
cpu_place
;
for
(
size_t
j
=
0
;
j
<
gpu_list_
.
size
();
++
j
)
{
auto
out_var
=
param_scopes_
[
j
]
->
FindVar
(
"out"
);
PADDLE_ENFORCE_NOT_NULL
(
out_var
);
auto
out_tensor
=
out_var
->
Get
<
f
::
LoDTensor
>
();
PADDLE_ENFORCE_EQ
(
out_tensor
.
lod
(),
lod
,
"lod is not equal."
);
f
::
Tensor
result_tensor
;
f
::
TensorCopySync
(
out_tensor
,
cpu_place
,
&
result_tensor
);
float
*
ct
=
result_tensor
.
mutable_data
<
float
>
(
cpu_place
);
for
(
int64_t
i
=
0
;
i
<
f
::
product
(
kDims
);
++
i
)
{
ASSERT_NEAR
(
ct
[
i
],
send_vector
[
i
],
1e-5
);
}
}
}
void
TestBroadcastSelectedRows
(
size_t
input_scope_idx
)
{
auto
in_var
=
param_scopes_
[
input_scope_idx
]
->
FindVar
(
"input"
);
PADDLE_ENFORCE_NOT_NULL
(
in_var
);
auto
in_selected_rows
=
in_var
->
GetMutable
<
f
::
SelectedRows
>
();
auto
value
=
in_selected_rows
->
mutable_value
();
value
->
mutable_data
<
float
>
(
kDims
,
gpu_list_
[
input_scope_idx
]);
int
height
=
static_cast
<
int
>
(
kDims
[
0
])
*
2
;
std
::
vector
<
int64_t
>
rows
{
0
,
1
,
2
,
3
,
3
,
0
,
14
,
7
,
3
,
1
,
2
,
4
,
6
,
3
,
1
,
1
,
1
,
1
,
3
,
7
};
in_selected_rows
->
set_height
(
height
);
in_selected_rows
->
set_rows
(
rows
);
std
::
vector
<
float
>
send_vector
(
static_cast
<
size_t
>
(
f
::
product
(
kDims
)));
for
(
size_t
k
=
0
;
k
<
send_vector
.
size
();
++
k
)
{
send_vector
[
k
]
=
k
;
}
paddle
::
framework
::
TensorFromVector
<
float
>
(
send_vector
,
*
(
ctxs_
[
input_scope_idx
]),
value
);
op_handle_
->
Run
(
false
);
WaitAll
();
p
::
CPUPlace
cpu_place
;
for
(
size_t
j
=
0
;
j
<
gpu_list_
.
size
();
++
j
)
{
auto
out_var
=
param_scopes_
[
j
]
->
FindVar
(
"out"
);
PADDLE_ENFORCE_NOT_NULL
(
out_var
);
auto
&
out_select_rows
=
out_var
->
Get
<
f
::
SelectedRows
>
();
auto
rt
=
out_select_rows
.
value
();
PADDLE_ENFORCE_EQ
(
out_select_rows
.
height
(),
height
,
"height is not equal."
);
for
(
size_t
k
=
0
;
k
<
out_select_rows
.
rows
().
size
();
++
k
)
{
PADDLE_ENFORCE_EQ
(
out_select_rows
.
rows
()[
k
],
rows
[
k
]);
}
f
::
Tensor
result_tensor
;
f
::
TensorCopySync
(
rt
,
cpu_place
,
&
result_tensor
);
float
*
ct
=
result_tensor
.
data
<
float
>
();
for
(
int64_t
i
=
0
;
i
<
f
::
product
(
kDims
);
++
i
)
{
ASSERT_NEAR
(
ct
[
i
],
send_vector
[
i
],
1e-5
);
}
}
}
};
TEST
(
BroadcastTester
,
TestCPUBroadcastTestLodTensor
)
{
TestBroadcastOpHandle
test_op
;
size_t
input_scope_idx
=
0
;
...
...
paddle/fluid/framework/details/broadcast_op_handle_test.h
0 → 100644
浏览文件 @
cf1944af
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/details/broadcast_op_handle.h"
#include "paddle/fluid/platform/device_context.h"
namespace
paddle
{
namespace
framework
{
namespace
details
{
namespace
f
=
paddle
::
framework
;
namespace
p
=
paddle
::
platform
;
// test data amount
const
f
::
DDim
kDims
=
{
20
,
20
};
struct
TestBroadcastOpHandle
{
std
::
vector
<
std
::
unique_ptr
<
p
::
DeviceContext
>>
ctxs_
;
std
::
vector
<
Scope
*>
local_scopes_
;
std
::
vector
<
Scope
*>
param_scopes_
;
Scope
g_scope_
;
std
::
unique_ptr
<
OpHandleBase
>
op_handle_
;
std
::
vector
<
std
::
unique_ptr
<
VarHandleBase
>>
vars_
;
std
::
vector
<
p
::
Place
>
place_list_
;
bool
use_gpu_
;
#ifdef PADDLE_WITH_CUDA
std
::
unique_ptr
<
platform
::
NCCLContextMap
>
nccl_ctxs_
;
#endif
void
WaitAll
()
{
for
(
size_t
j
=
0
;
j
<
ctxs_
.
size
();
++
j
)
{
ctxs_
[
j
]
->
Wait
();
}
#ifdef PADDLE_WITH_CUDA
if
(
nccl_ctxs_
)
{
nccl_ctxs_
->
WaitAll
();
}
#endif
}
void
InitCtxOnGpu
(
bool
use_gpu
)
{
use_gpu_
=
use_gpu
;
if
(
use_gpu_
)
{
#ifdef PADDLE_WITH_CUDA
int
count
=
p
::
GetCUDADeviceCount
();
if
(
count
<=
1
)
{
LOG
(
WARNING
)
<<
"Cannot test multi-gpu Broadcast, because the CUDA "
"device count is "
<<
count
;
exit
(
0
);
}
for
(
int
i
=
0
;
i
<
count
;
++
i
)
{
auto
p
=
p
::
CUDAPlace
(
i
);
place_list_
.
push_back
(
p
);
ctxs_
.
emplace_back
(
new
p
::
CUDADeviceContext
(
p
));
}
nccl_ctxs_
.
reset
(
new
platform
::
NCCLContextMap
(
place_list_
));
#else
PADDLE_THROW
(
"CUDA is not support."
);
#endif
}
else
{
int
count
=
8
;
for
(
int
i
=
0
;
i
<
count
;
++
i
)
{
auto
p
=
p
::
CPUPlace
();
place_list_
.
push_back
(
p
);
ctxs_
.
emplace_back
(
new
p
::
CPUDeviceContext
(
p
));
}
#ifdef PADDLE_WITH_CUDA
nccl_ctxs_
.
reset
(
nullptr
);
#endif
}
}
void
InitBroadcastOp
(
size_t
input_scope_idx
)
{
for
(
size_t
j
=
0
;
j
<
place_list_
.
size
();
++
j
)
{
local_scopes_
.
push_back
(
&
(
g_scope_
.
NewScope
()));
Scope
&
local_scope
=
local_scopes_
.
back
()
->
NewScope
();
*
local_scopes_
.
back
()
->
Var
(
details
::
kLocalExecScopeName
)
->
GetMutable
<
Scope
*>
()
=
&
local_scope
;
local_scope
.
Var
(
"out"
);
param_scopes_
.
emplace_back
(
&
local_scope
);
}
param_scopes_
[
input_scope_idx
]
->
Var
(
"input"
);
std
::
unique_ptr
<
ir
::
Node
>
n
=
ir
::
CreateNodeForTest
(
"node0"
,
ir
::
Node
::
Type
::
kOperation
);
if
(
use_gpu_
)
{
#ifdef PADDLE_WITH_CUDA
op_handle_
.
reset
(
new
BroadcastOpHandle
(
n
.
get
(),
local_scopes_
,
place_list_
,
nccl_ctxs_
.
get
()));
#else
PADDLE_THROW
(
"CUDA is not support."
);
#endif
}
else
{
#ifdef PADDLE_WITH_CUDA
op_handle_
.
reset
(
new
BroadcastOpHandle
(
n
.
get
(),
local_scopes_
,
place_list_
,
nccl_ctxs_
.
get
()));
#else
op_handle_
.
reset
(
new
BroadcastOpHandle
(
n
.
get
(),
local_scopes_
,
place_list_
));
#endif
}
std
::
unique_ptr
<
ir
::
Node
>
v
=
ir
::
CreateNodeForTest
(
"node1"
,
ir
::
Node
::
Type
::
kVariable
);
auto
*
in_var_handle
=
new
VarHandle
(
v
.
get
(),
1
,
input_scope_idx
,
"input"
,
place_list_
[
input_scope_idx
]);
vars_
.
emplace_back
(
in_var_handle
);
op_handle_
->
AddInput
(
in_var_handle
);
// add dummy var
std
::
unique_ptr
<
ir
::
Node
>
v2
=
ir
::
CreateNodeForTest
(
"node2"
,
ir
::
Node
::
Type
::
kVariable
);
vars_
.
emplace_back
(
new
DummyVarHandle
(
v2
.
get
()));
DummyVarHandle
*
dummy_var_handle
=
static_cast
<
DummyVarHandle
*>
(
vars_
.
back
().
get
());
dummy_var_handle
->
ClearGeneratedOp
();
op_handle_
->
AddInput
(
dummy_var_handle
);
for
(
size_t
j
=
0
;
j
<
place_list_
.
size
();
++
j
)
{
if
(
!
use_gpu_
)
{
op_handle_
->
SetDeviceContext
(
place_list_
[
j
],
ctxs_
[
j
].
get
());
}
std
::
unique_ptr
<
ir
::
Node
>
v3
=
ir
::
CreateNodeForTest
(
"node3"
,
ir
::
Node
::
Type
::
kVariable
);
VarHandle
*
out_var_handle
=
new
VarHandle
(
v3
.
get
(),
2
,
j
,
"out"
,
place_list_
[
j
]);
vars_
.
emplace_back
(
out_var_handle
);
op_handle_
->
AddOutput
(
out_var_handle
);
}
// add dummy var
std
::
unique_ptr
<
ir
::
Node
>
v4
=
ir
::
CreateNodeForTest
(
"node4"
,
ir
::
Node
::
Type
::
kVariable
);
vars_
.
emplace_back
(
new
DummyVarHandle
(
v4
.
get
()));
DummyVarHandle
*
out_dummy_var_handle
=
static_cast
<
DummyVarHandle
*>
(
vars_
.
back
().
get
());
out_dummy_var_handle
->
ClearGeneratedOp
();
op_handle_
->
AddOutput
(
out_dummy_var_handle
);
}
std
::
vector
<
float
>
InitLoDTensor
(
const
std
::
string
&
varname
,
size_t
input_scope_idx
,
const
f
::
LoD
&
lod
,
float
val_scalar
=
0.0
)
{
auto
var
=
param_scopes_
[
input_scope_idx
]
->
FindVar
(
varname
);
PADDLE_ENFORCE_NOT_NULL
(
var
);
auto
lod_tensor
=
var
->
GetMutable
<
f
::
LoDTensor
>
();
std
::
vector
<
float
>
send_vector
(
static_cast
<
size_t
>
(
f
::
product
(
kDims
)));
for
(
size_t
k
=
0
;
k
<
send_vector
.
size
();
++
k
)
{
send_vector
[
k
]
=
k
+
val_scalar
;
}
paddle
::
framework
::
TensorFromVector
<
float
>
(
send_vector
,
*
(
ctxs_
[
input_scope_idx
]),
lod_tensor
);
lod_tensor
->
set_lod
(
lod
);
lod_tensor
->
Resize
(
kDims
);
return
send_vector
;
}
std
::
vector
<
float
>
InitSelectedRows
(
const
std
::
string
&
varname
,
size_t
input_scope_idx
,
const
std
::
vector
<
int64_t
>&
rows
,
int
height
,
float
value_scalar
=
0.0
)
{
std
::
vector
<
float
>
send_vector
(
static_cast
<
size_t
>
(
f
::
product
(
kDims
)));
for
(
size_t
k
=
0
;
k
<
send_vector
.
size
();
++
k
)
{
send_vector
[
k
]
=
k
+
value_scalar
;
}
auto
var
=
param_scopes_
[
input_scope_idx
]
->
FindVar
(
varname
);
PADDLE_ENFORCE_NOT_NULL
(
var
);
auto
selected_rows
=
var
->
GetMutable
<
f
::
SelectedRows
>
();
auto
value
=
selected_rows
->
mutable_value
();
value
->
mutable_data
<
float
>
(
kDims
,
place_list_
[
input_scope_idx
]);
selected_rows
->
set_height
(
height
);
selected_rows
->
set_rows
(
rows
);
paddle
::
framework
::
TensorFromVector
<
float
>
(
send_vector
,
*
(
ctxs_
[
input_scope_idx
]),
value
);
return
send_vector
;
}
void
SelectedRowsEqual
(
const
std
::
string
&
varname
,
int
input_scope_idx
,
const
std
::
vector
<
float
>&
send_vector
,
const
std
::
vector
<
int64_t
>&
rows
,
int
height
)
{
auto
var
=
param_scopes_
[
input_scope_idx
]
->
FindVar
(
varname
);
PADDLE_ENFORCE_NOT_NULL
(
var
);
auto
&
selected_rows
=
var
->
Get
<
f
::
SelectedRows
>
();
auto
rt
=
selected_rows
.
value
();
PADDLE_ENFORCE_EQ
(
selected_rows
.
height
(),
height
,
"height is not equal."
);
for
(
size_t
k
=
0
;
k
<
selected_rows
.
rows
().
size
();
++
k
)
{
PADDLE_ENFORCE_EQ
(
selected_rows
.
rows
()[
k
],
rows
[
k
]);
}
p
::
CPUPlace
cpu_place
;
f
::
Tensor
result_tensor
;
f
::
TensorCopySync
(
rt
,
cpu_place
,
&
result_tensor
);
float
*
ct
=
result_tensor
.
data
<
float
>
();
for
(
int64_t
i
=
0
;
i
<
f
::
product
(
kDims
);
++
i
)
{
ASSERT_NEAR
(
ct
[
i
],
send_vector
[
i
],
1e-5
);
}
}
void
LoDTensorEqual
(
const
std
::
string
&
varname
,
const
std
::
vector
<
float
>&
send_vec
,
const
f
::
LoD
&
lod
,
framework
::
Scope
*
scope
)
{
p
::
CPUPlace
cpu_place
;
auto
var
=
scope
->
FindVar
(
varname
);
PADDLE_ENFORCE_NOT_NULL
(
var
);
auto
tensor
=
var
->
Get
<
f
::
LoDTensor
>
();
PADDLE_ENFORCE_EQ
(
tensor
.
lod
(),
lod
,
"lod is not equal."
);
f
::
Tensor
result_tensor
;
f
::
TensorCopySync
(
tensor
,
cpu_place
,
&
result_tensor
);
float
*
ct
=
result_tensor
.
mutable_data
<
float
>
(
cpu_place
);
for
(
int64_t
k
=
0
;
k
<
f
::
product
(
kDims
);
++
k
)
{
ASSERT_NEAR
(
ct
[
k
],
send_vec
[
k
],
1e-5
);
}
}
void
TestBroadcastLodTensor
(
size_t
input_scope_idx
)
{
f
::
LoD
lod
{{
0
,
10
,
20
}};
auto
send_vector
=
InitLoDTensor
(
"input"
,
input_scope_idx
,
lod
);
op_handle_
->
Run
(
false
);
WaitAll
();
for
(
size_t
j
=
0
;
j
<
place_list_
.
size
();
++
j
)
{
LoDTensorEqual
(
"out"
,
send_vector
,
lod
,
param_scopes_
[
j
]);
}
}
void
TestBroadcastSelectedRows
(
size_t
input_scope_idx
)
{
std
::
vector
<
int64_t
>
rows
{
0
,
1
,
2
,
3
,
3
,
0
,
14
,
7
,
3
,
1
,
2
,
4
,
6
,
3
,
1
,
1
,
1
,
1
,
3
,
7
};
int
height
=
static_cast
<
int
>
(
kDims
[
0
]
*
2
);
auto
send_vector
=
InitSelectedRows
(
"input"
,
input_scope_idx
,
rows
,
height
);
op_handle_
->
Run
(
false
);
WaitAll
();
for
(
size_t
j
=
0
;
j
<
place_list_
.
size
();
++
j
)
{
SelectedRowsEqual
(
"out"
,
input_scope_idx
,
send_vector
,
rows
,
height
);
}
}
};
}
// namespace details
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/details/computation_op_handle.cc
浏览文件 @
cf1944af
...
...
@@ -37,7 +37,7 @@ void ComputationOpHandle::RunImpl() {
bool
ComputationOpHandle
::
NeedWait
(
VarHandleBase
*
in_var
)
{
bool
need_wait
=
in_var
&&
in_var
->
GeneratedOp
()
&&
in_var
->
GeneratedOp
()
->
DeviceContext
(
place_
)
!=
dev_ctxes_
[
place_
]
;
in_var
->
GeneratedOp
()
->
DeviceContext
(
place_
)
!=
dev_ctxes_
.
at
(
place_
)
;
return
need_wait
;
}
...
...
paddle/fluid/framework/details/data_balance_op_handle.cc
浏览文件 @
cf1944af
...
...
@@ -28,7 +28,7 @@ DataBalanceOpHandle::DataBalanceOpHandle(
:
OpHandleBase
(
node
),
local_scopes_
(
local_scopes
),
places_
(
places
)
{
if
(
ctxs
)
{
for
(
auto
&
p
:
places_
)
{
this
->
dev_ctxes_
[
p
]
=
ctxs
->
DevCtx
(
p
);
this
->
SetDeviceContext
(
p
,
ctxs
->
DevCtx
(
p
)
);
}
}
}
...
...
@@ -89,8 +89,8 @@ void DataBalanceOpHandle::RunImpl() {
PADDLE_ENFORCE_GT
(
places_
.
size
(),
1
,
"Data balance can only be enabled when the number of "
"places to run larger than 1."
);
auto
in_var_handles
=
DynamicCast
<
VarHandle
>
(
inputs_
);
auto
out_var_handles
=
DynamicCast
<
VarHandle
>
(
outputs_
);
auto
in_var_handles
=
DynamicCast
<
VarHandle
>
(
this
->
Inputs
()
);
auto
out_var_handles
=
DynamicCast
<
VarHandle
>
(
this
->
Outputs
()
);
PADDLE_ENFORCE
(
in_var_handles
.
size
()
%
places_
.
size
()
==
0
);
PADDLE_ENFORCE_EQ
(
in_var_handles
.
size
(),
out_var_handles
.
size
(),
...
...
paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
浏览文件 @
cf1944af
...
...
@@ -92,13 +92,13 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
size_t
num_complete
=
0
;
remaining_
=
0
;
BlockingQueue
<
size_t
>
complete_q
;
auto
complete_q
=
std
::
make_shared
<
BlockingQueue
<
size_t
>>
()
;
for
(
auto
op
:
bootstrap_ops_
)
{
RunOpAsync
(
op_deps
.
get
(),
op
,
&
complete_q
);
RunOpAsync
(
op_deps
.
get
(),
op
,
complete_q
);
}
while
(
num_complete
!=
op_deps
->
size
())
{
size_t
num_comp
=
complete_q
.
Pop
();
size_t
num_comp
=
complete_q
->
Pop
();
if
(
num_comp
==
-
1UL
)
{
int
remaining
=
0
;
while
(
true
)
{
...
...
@@ -107,7 +107,7 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
break
;
}
for
(
int
i
=
0
;
i
<
remaining
;
++
i
)
{
complete_q
.
Pop
();
complete_q
->
Pop
();
}
}
exception_
.
ReThrow
();
...
...
@@ -120,7 +120,8 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
}
void
FastThreadedSSAGraphExecutor
::
RunOpAsync
(
std
::
unordered_map
<
OpHandleBase
*
,
std
::
atomic
<
int
>>
*
op_deps
,
OpHandleBase
*
op
,
BlockingQueue
<
size_t
>
*
complete_q
)
{
OpHandleBase
*
op
,
const
std
::
shared_ptr
<
BlockingQueue
<
size_t
>>
&
complete_q
)
{
++
remaining_
;
this
->
pool_
.
enqueue
([
=
]
{
OpHandleBase
*
op_to_run
=
op
;
...
...
@@ -144,7 +145,7 @@ void FastThreadedSSAGraphExecutor::RunOpAsync(
if
(
op_to_run
==
nullptr
)
{
op_to_run
=
pending_op
;
}
else
{
this
->
RunOpAsync
(
op_deps
,
pending_op
,
complete_q
);
RunOpAsync
(
op_deps
,
pending_op
,
complete_q
);
}
}
}
...
...
@@ -156,8 +157,7 @@ void FastThreadedSSAGraphExecutor::RunOpAsync(
}
void
FastThreadedSSAGraphExecutor
::
PrepareAtomicOpDeps
()
{
atomic_op_deps_
=
pool_
.
enqueue
([
&
]
{
std
::
unordered_map
<
OpHandleBase
*
,
std
::
atomic
<
int
>>
*
op_deps
=
new
std
::
unordered_map
<
OpHandleBase
*
,
std
::
atomic
<
int
>>
;
auto
*
op_deps
=
new
std
::
unordered_map
<
OpHandleBase
*
,
std
::
atomic
<
int
>>
;
for
(
auto
&
pair
:
op_deps_
)
{
(
*
op_deps
)[
pair
.
first
]
=
pair
.
second
;
}
...
...
paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
浏览文件 @
cf1944af
...
...
@@ -50,7 +50,8 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor {
std
::
atomic
<
int
>
remaining_
;
void
RunOpAsync
(
std
::
unordered_map
<
OpHandleBase
*
,
std
::
atomic
<
int
>>
*
op_deps
,
OpHandleBase
*
op
,
BlockingQueue
<
size_t
>
*
complete_q
);
OpHandleBase
*
op
,
const
std
::
shared_ptr
<
BlockingQueue
<
size_t
>>
&
complete_q
);
void
PrepareAtomicOpDeps
();
...
...
paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
0 → 100644
浏览文件 @
cf1944af
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/fused_broadcast_op_handle.h"
#include "gtest/gtest.h"
#include "paddle/fluid/framework/details/broadcast_op_handle_test.h"
namespace
paddle
{
namespace
framework
{
namespace
details
{
struct
TestFusedBroadcastOpHandle
:
TestBroadcastOpHandle
{
std
::
vector
<
std
::
string
>
out_varnames_
;
void
InitFusedBroadcastOp
(
std
::
vector
<
size_t
>
input_scope_idxes
)
{
// initialize scope and var
for
(
size_t
i
=
0
;
i
<
place_list_
.
size
();
++
i
)
{
local_scopes_
.
push_back
(
&
(
g_scope_
.
NewScope
()));
Scope
&
local_scope
=
local_scopes_
.
back
()
->
NewScope
();
*
local_scopes_
.
back
()
->
Var
(
details
::
kLocalExecScopeName
)
->
GetMutable
<
Scope
*>
()
=
&
local_scope
;
for
(
size_t
j
=
0
;
j
<
input_scope_idxes
.
size
();
++
j
)
{
local_scope
.
Var
(
"out_var"
+
j
);
if
(
i
==
j
)
local_scope
.
Var
(
"in_var"
+
j
);
}
param_scopes_
.
emplace_back
(
&
local_scope
);
}
// create op handle node
std
::
unique_ptr
<
ir
::
Node
>
n
=
ir
::
CreateNodeForTest
(
"fused_broadcast"
,
ir
::
Node
::
Type
::
kOperation
);
if
(
use_gpu_
)
{
#ifdef PADDLE_WITH_CUDA
op_handle_
.
reset
(
new
FusedBroadcastOpHandle
(
n
.
get
(),
local_scopes_
,
place_list_
,
nccl_ctxs_
.
get
()));
#else
PADDLE_THROW
(
"CUDA is not supported."
);
#endif
}
else
{
#ifdef PADDLE_WITH_CUDA
op_handle_
.
reset
(
new
FusedBroadcastOpHandle
(
n
.
get
(),
local_scopes_
,
place_list_
,
nccl_ctxs_
.
get
()));
#else
op_handle_
.
reset
(
new
FusedBroadcastOpHandle
(
n
.
get
(),
local_scopes_
,
place_list_
));
#endif
}
for
(
size_t
i
=
0
;
i
<
input_scope_idxes
.
size
();
++
i
)
{
// add input var handle
std
::
unique_ptr
<
ir
::
Node
>
in_node
=
ir
::
CreateNodeForTest
(
"in_node"
+
i
,
ir
::
Node
::
Type
::
kVariable
);
VarHandle
*
in_var_handle
=
new
VarHandle
(
in_node
.
get
(),
1
,
input_scope_idxes
[
i
],
"in_var"
+
i
,
place_list_
[
input_scope_idxes
[
i
]]);
vars_
.
emplace_back
(
in_var_handle
);
op_handle_
->
AddInput
(
in_var_handle
);
// add output var handle
for
(
size_t
j
=
0
;
j
<
place_list_
.
size
();
++
j
)
{
std
::
unique_ptr
<
ir
::
Node
>
out_node
=
ir
::
CreateNodeForTest
(
"out_node"
+
i
,
ir
::
Node
::
Type
::
kVariable
);
VarHandle
*
out_var_handle
=
new
VarHandle
(
out_node
.
get
(),
2
,
j
,
"out_var"
+
i
,
place_list_
[
j
]);
vars_
.
emplace_back
(
out_var_handle
);
op_handle_
->
AddOutput
(
out_var_handle
);
}
}
}
void
TestFusedBroadcastLoDTensor
(
std
::
vector
<
size_t
>
input_scope_idxes
)
{
std
::
vector
<
std
::
vector
<
float
>>
send_vec
;
f
::
LoD
lod
{{
0
,
10
,
20
}};
for
(
size_t
i
=
0
;
i
<
input_scope_idxes
.
size
();
++
i
)
{
const
std
::
string
varname
(
"in_var"
+
i
);
float
val_scalar
=
static_cast
<
float
>
(
i
);
send_vec
.
push_back
(
InitLoDTensor
(
varname
,
input_scope_idxes
[
i
],
lod
,
val_scalar
));
}
op_handle_
->
Run
(
false
);
WaitAll
();
for
(
size_t
i
=
0
;
i
<
input_scope_idxes
.
size
();
++
i
)
{
const
std
::
string
&
varname
(
"out_var"
+
i
);
for
(
size_t
j
=
0
;
j
<
place_list_
.
size
();
++
j
)
{
LoDTensorEqual
(
varname
,
send_vec
[
i
],
lod
,
param_scopes_
[
j
]);
}
}
}
void
TestFusedBroadcastSelectedRows
(
std
::
vector
<
size_t
>
input_scope_idxes
)
{
std
::
vector
<
std
::
vector
<
float
>>
send_vector
;
std
::
vector
<
int64_t
>
rows
{
0
,
1
,
2
,
3
,
3
,
0
,
14
,
7
,
3
,
1
,
2
,
4
,
6
,
3
,
1
,
1
,
1
,
1
,
3
,
7
};
int
height
=
static_cast
<
int
>
(
kDims
[
0
]
*
2
);
for
(
size_t
i
=
0
;
i
<
input_scope_idxes
.
size
();
++
i
)
{
const
std
::
string
varname
(
"in_var"
+
i
);
float
val_scalar
=
static_cast
<
float
>
(
i
);
send_vector
.
push_back
(
InitSelectedRows
(
varname
,
input_scope_idxes
[
i
],
rows
,
height
,
val_scalar
));
}
op_handle_
->
Run
(
false
);
WaitAll
();
for
(
size_t
i
=
0
;
i
<
input_scope_idxes
.
size
();
++
i
)
{
const
std
::
string
&
varname
(
"out_var"
+
i
);
for
(
size_t
j
=
0
;
j
<
place_list_
.
size
();
++
j
)
{
SelectedRowsEqual
(
varname
,
input_scope_idxes
[
i
],
send_vector
[
i
],
rows
,
height
);
}
}
}
};
TEST
(
FusedBroadcastTester
,
CPULodTensor
)
{
TestFusedBroadcastOpHandle
test_op
;
std
::
vector
<
size_t
>
input_scope_idxes
=
{
0
,
1
};
test_op
.
InitCtxOnGpu
(
false
);
test_op
.
InitFusedBroadcastOp
(
input_scope_idxes
);
test_op
.
TestFusedBroadcastLoDTensor
(
input_scope_idxes
);
}
TEST
(
FusedBroadcastTester
,
CPUSelectedRows
)
{
TestFusedBroadcastOpHandle
test_op
;
std
::
vector
<
size_t
>
input_scope_idxes
=
{
0
,
1
};
test_op
.
InitCtxOnGpu
(
false
);
test_op
.
InitFusedBroadcastOp
(
input_scope_idxes
);
test_op
.
TestFusedBroadcastSelectedRows
(
input_scope_idxes
);
}
#ifdef PADDLE_WITH_CUDA
TEST
(
FusedBroadcastTester
,
GPULodTensor
)
{
TestFusedBroadcastOpHandle
test_op
;
std
::
vector
<
size_t
>
input_scope_idxes
=
{
0
,
1
};
test_op
.
InitCtxOnGpu
(
true
);
test_op
.
InitFusedBroadcastOp
(
input_scope_idxes
);
test_op
.
TestFusedBroadcastLoDTensor
(
input_scope_idxes
);
}
TEST
(
FusedBroadcastTester
,
GPUSelectedRows
)
{
TestFusedBroadcastOpHandle
test_op
;
std
::
vector
<
size_t
>
input_scope_idxes
=
{
0
,
1
};
test_op
.
InitCtxOnGpu
(
true
);
test_op
.
InitFusedBroadcastOp
(
input_scope_idxes
);
test_op
.
TestFusedBroadcastSelectedRows
(
input_scope_idxes
);
}
#endif
}
// namespace details
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/details/gather_op_handle.cc
浏览文件 @
cf1944af
...
...
@@ -36,7 +36,7 @@ void GatherOpHandle::RunImpl() {
VarHandle
*
out_var_handle
;
{
auto
out_var_handles
=
DynamicCast
<
VarHandle
>
(
outputs_
);
auto
out_var_handles
=
DynamicCast
<
VarHandle
>
(
this
->
Outputs
()
);
PADDLE_ENFORCE_EQ
(
out_var_handles
.
size
(),
1
,
"The number of output should be one."
);
out_var_handle
=
out_var_handles
.
front
();
...
...
@@ -99,7 +99,7 @@ void GatherOpHandle::RunImpl() {
Tensor
*
out_tensor
=
out_value
->
mutable_value
();
// copy
auto
dev_ctx
=
dev_ctxes_
[
out_var_handle
->
place_
]
;
auto
dev_ctx
=
dev_ctxes_
.
at
(
out_var_handle
->
place_
)
;
RunAndRecordEvent
(
out_var_handle
->
place_
,
[
in_tensors
,
out_tensor
,
&
dev_ctx
,
t_out_p
]
{
int
s
=
0
,
e
=
0
;
...
...
paddle/fluid/framework/details/op_handle_base.cc
浏览文件 @
cf1944af
...
...
@@ -103,7 +103,7 @@ void OpHandleBase::WaitInputVarGenerated() {
void
OpHandleBase
::
WaitInputVarGenerated
(
const
platform
::
Place
&
place
)
{
for
(
auto
*
in
:
inputs_
)
{
if
(
NeedWait
(
in
))
{
in
->
GeneratedOp
()
->
RecordWaitEventOnCtx
(
dev_ctxes_
[
place
]
);
in
->
GeneratedOp
()
->
RecordWaitEventOnCtx
(
dev_ctxes_
.
at
(
place
)
);
}
}
}
...
...
paddle/fluid/framework/details/reduce_op_handle.cc
浏览文件 @
cf1944af
...
...
@@ -27,7 +27,7 @@ namespace framework {
namespace
details
{
void
ReduceOpHandle
::
RunImpl
()
{
platform
::
RecordEvent
record_event
(
Name
(),
dev_ctxes_
.
begin
()
->
second
);
platform
::
RecordEvent
record_event
(
Name
(),
dev_ctxes_
.
c
begin
()
->
second
);
if
(
places_
.
size
()
==
1
)
return
;
// the input and output may have dummy var.
...
...
paddle/fluid/framework/details/reduce_op_handle.h
浏览文件 @
cf1944af
...
...
@@ -46,7 +46,8 @@ struct ReduceOpHandle : public OpHandleBase {
nccl_ctxs_
(
nccl_ctxs
)
{
if
(
nccl_ctxs_
)
{
for
(
auto
&
p_ctx
:
nccl_ctxs_
->
contexts_
)
{
dev_ctxes_
[
platform
::
CUDAPlace
(
p_ctx
.
first
)]
=
p_ctx
.
second
.
ctx_
.
get
();
this
->
SetDeviceContext
(
platform
::
CUDAPlace
(
p_ctx
.
first
),
p_ctx
.
second
.
ctx_
.
get
());
}
}
}
...
...
paddle/fluid/framework/details/rpc_op_handle.cc
浏览文件 @
cf1944af
...
...
@@ -38,7 +38,7 @@ void RPCOpHandle::RunImpl() {
continue
;
}
if
(
in
->
GeneratedOp
())
{
in
->
GeneratedOp
()
->
RecordWaitEventOnCtx
(
dev_ctxes_
[
p
]
);
in
->
GeneratedOp
()
->
RecordWaitEventOnCtx
(
dev_ctxes_
.
at
(
p
)
);
}
}
auto
&
tmp_scope
=
local_scope_
->
FindVar
(
kLocalExecScopeName
)
->
Get
<
Scope
*>
();
...
...
paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
浏览文件 @
cf1944af
...
...
@@ -27,7 +27,7 @@ ScaleLossGradOpHandle::ScaleLossGradOpHandle(ir::Node *node, size_t num_dev,
coeff_
(
static_cast
<
float
>
(
1.0
/
num_dev
)),
scope_
(
scope
),
place_
(
place
)
{
dev_ctxes_
[
place_
]
=
dev_ctx
;
this
->
SetDeviceContext
(
place_
,
dev_ctx
)
;
}
ScaleLossGradOpHandle
::~
ScaleLossGradOpHandle
()
{}
...
...
@@ -46,8 +46,8 @@ void ScaleLossGradOpHandle::RunImpl() {
}
else
{
#ifdef PADDLE_WITH_CUDA
this
->
RunAndRecordEvent
([
&
]
{
auto
stream
=
static_cast
<
platform
::
CUDADeviceContext
*>
(
this
->
dev_ctxes_
[
place_
]
)
auto
stream
=
static_cast
<
platform
::
CUDADeviceContext
*>
(
this
->
dev_ctxes_
.
at
(
place_
)
)
->
stream
();
memory
::
Copy
(
boost
::
get
<
platform
::
CUDAPlace
>
(
place_
),
tmp
,
platform
::
CPUPlace
(),
&
coeff_
,
sizeof
(
float
),
stream
);
...
...
paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
浏览文件 @
cf1944af
...
...
@@ -39,7 +39,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
new
platform
::
RecordEvent
(
"ThreadedSSAGraphExecutorPrepare"
,
nullptr
));
std
::
unordered_map
<
OpHandleBase
*
,
size_t
>
pending_ops
;
std
::
unordered_set
<
VarHandleBase
*>
pending_vars
;
BlockingQueue
<
VarHandleBase
*>
ready_vars
;
auto
ready_vars
=
std
::
make_shared
<
BlockingQueue
<
VarHandleBase
*>>
()
;
std
::
unordered_set
<
OpHandleBase
*>
ready_ops
;
// For ops (e.g. nccl_all_reduce) that need to coordinate multiple
// streams from multiple GPUs, it's faster to buffer them and schedule
...
...
@@ -51,12 +51,12 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
for
(
auto
&
var_map
:
graph_
->
Get
<
details
::
GraphVars
>
(
details
::
kGraphVars
))
{
for
(
auto
&
name_pair
:
var_map
)
{
for
(
auto
&
version_pair
:
name_pair
.
second
)
{
InsertPendingVar
(
&
pending_vars
,
&
ready_vars
,
version_pair
.
get
());
InsertPendingVar
(
&
pending_vars
,
ready_vars
.
get
()
,
version_pair
.
get
());
}
}
}
for
(
auto
&
var
:
graph_
->
Get
<
details
::
GraphDepVars
>
(
details
::
kGraphDepVars
))
{
InsertPendingVar
(
&
pending_vars
,
&
ready_vars
,
var
.
get
());
InsertPendingVar
(
&
pending_vars
,
ready_vars
.
get
()
,
var
.
get
());
}
for
(
auto
&
op
:
graph_
->
Get
<
details
::
GraphOps
>
(
details
::
kGraphOps
))
{
...
...
@@ -73,12 +73,12 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
FeedFetchList
fetch_data
(
fetch_tensors
.
size
());
InsertFetchOps
(
fetch_tensors
,
&
fetch_ops
,
&
fetch_dependencies
,
&
pending_ops
,
&
pending_vars
,
&
ready_vars
,
&
fetch_data
);
&
pending_vars
,
ready_vars
.
get
()
,
&
fetch_data
);
auto
run_all_ops
=
[
&
](
std
::
unordered_set
<
OpHandleBase
*>
&
set
)
{
for
(
auto
*
op
:
set
)
{
running_ops_
++
;
RunOp
(
&
ready_vars
,
op
);
RunOp
(
ready_vars
,
op
);
}
set
.
clear
();
};
...
...
@@ -87,7 +87,6 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
run_op_futures_
.
clear
();
exception_holder_
.
Clear
();
event
.
reset
(
nullptr
);
// Step 3. Execution
while
(
!
pending_vars
.
empty
())
{
// 1. Run All Ready ops
...
...
@@ -103,7 +102,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
// 2. Find ready variable
bool
timeout
;
auto
cur_ready_vars
=
ready_vars
.
PopAll
(
1
,
&
timeout
);
auto
cur_ready_vars
=
ready_vars
->
PopAll
(
1
,
&
timeout
);
if
(
timeout
)
{
if
(
exception_holder_
.
IsCaught
())
{
...
...
@@ -133,7 +132,6 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
}
}
PADDLE_ENFORCE
(
ready_ops
.
empty
());
// Wait FetchOps.
ClearFetchOp
(
graph_
.
get
(),
&
fetch_ops
);
...
...
@@ -206,7 +204,8 @@ void ThreadedSSAGraphExecutor::InsertPendingVar(
}
void
ThreadedSSAGraphExecutor
::
RunOp
(
BlockingQueue
<
VarHandleBase
*>
*
ready_var_q
,
details
::
OpHandleBase
*
op
)
{
const
std
::
shared_ptr
<
BlockingQueue
<
VarHandleBase
*>>
&
ready_var_q
,
details
::
OpHandleBase
*
op
)
{
auto
op_run
=
[
ready_var_q
,
op
,
this
]
{
try
{
if
(
VLOG_IS_ON
(
10
))
{
...
...
paddle/fluid/framework/details/threaded_ssa_graph_executor.h
浏览文件 @
cf1944af
...
...
@@ -51,7 +51,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
~
ThreadedSSAGraphExecutor
()
{}
private:
void
RunOp
(
BlockingQueue
<
VarHandleBase
*>
*
ready_var_q
,
void
RunOp
(
const
std
::
shared_ptr
<
BlockingQueue
<
VarHandleBase
*>>
&
ready_var_q
,
details
::
OpHandleBase
*
op
);
private:
...
...
paddle/fluid/framework/ir/graph_pattern_detector.cc
浏览文件 @
cf1944af
...
...
@@ -259,6 +259,15 @@ GraphPatternDetector::DetectPatterns() {
return
result
;
}
bool
GraphItemCMP
(
const
std
::
pair
<
PDNode
*
,
Node
*>
&
a
,
const
std
::
pair
<
PDNode
*
,
Node
*>
&
b
)
{
if
(
a
.
first
!=
b
.
first
)
{
return
a
.
first
<
b
.
first
;
}
else
{
return
a
.
second
<
b
.
second
;
}
}
// TODO(Superjomn) enhance the function as it marks unique unique as duplicates
// see https://github.com/PaddlePaddle/Paddle/issues/13550
void
GraphPatternDetector
::
UniquePatterns
(
...
...
@@ -267,12 +276,16 @@ void GraphPatternDetector::UniquePatterns(
std
::
vector
<
GraphPatternDetector
::
subgraph_t
>
result
;
std
::
unordered_set
<
size_t
>
set
;
std
::
hash
<
std
::
string
>
hasher
;
for
(
auto
&
g
:
*
subgraphs
)
{
size_t
key
=
0
;
for
(
auto
&
item
:
g
)
{
key
^=
std
::
hash
<
void
*>
{}(
item
.
first
);
key
^=
std
::
hash
<
void
*>
{}(
item
.
second
);
}
// Sort the items in the sub-graph, and transform to a string key.
std
::
vector
<
std
::
pair
<
PDNode
*
,
Node
*>>
sorted_keys
(
g
.
begin
(),
g
.
end
());
std
::
sort
(
sorted_keys
.
begin
(),
sorted_keys
.
end
(),
GraphItemCMP
);
std
::
stringstream
ss
;
for
(
auto
&
item
:
sorted_keys
)
{
ss
<<
item
.
first
<<
":"
<<
item
.
second
;
}
auto
key
=
hasher
(
ss
.
str
());
if
(
!
set
.
count
(
key
))
{
result
.
emplace_back
(
g
);
set
.
insert
(
key
);
...
...
paddle/fluid/framework/lod_tensor.cc
浏览文件 @
cf1944af
...
...
@@ -418,7 +418,7 @@ void LoDTensor::MergeLoDTensor(
PADDLE_ENFORCE_EQ
(
new_lod
.
size
(),
lod
.
size
());
for
(
size_t
j
=
0
;
j
<
lod
.
size
();
++
j
)
{
auto
&
sub_lod
=
new_lod
[
j
];
auto
&
offset
=
sub_lod
.
back
();
size_t
offset
=
sub_lod
.
back
();
for
(
size_t
k
=
1
;
k
<
lod
[
j
].
size
();
++
k
)
{
sub_lod
.
push_back
(
lod
[
j
][
k
]
+
offset
);
}
...
...
paddle/fluid/framework/lod_tensor_array.h
浏览文件 @
cf1944af
...
...
@@ -19,81 +19,7 @@ limitations under the License. */
namespace
paddle
{
namespace
framework
{
// NOTE The vector<LoDTensor> can't be replaced with the class LoDTensorArray
// directly, because there are many vector<LoDTensor> used accross the project,
// and some of them are treated as LoDTensorArray.
#if !defined(PADDLE_ON_INFERENCE)
using
LoDTensorArray
=
std
::
vector
<
LoDTensor
>
;
#else // !PADDLE_ON_INFERENCE
#pragma message "LoDTensorArray is replaced with the inference one."
/*
* A LoDTensorArray which will not deallocate buffer when resized, fix the data
* diff in inference, and more performance friendly in the concurrency
* scenerios.
*/
class
LoDTensorArray
{
public:
LoDTensorArray
()
=
default
;
using
iterator
=
std
::
vector
<
LoDTensor
>::
iterator
;
using
const_iterator
=
std
::
vector
<
LoDTensor
>::
const_iterator
;
const_iterator
begin
()
const
{
return
array_
.
begin
();
}
const_iterator
end
()
const
{
return
array_
.
begin
()
+
size_
;
}
iterator
begin
()
{
return
array_
.
begin
();
}
iterator
end
()
{
return
array_
.
begin
()
+
size_
;
}
void
push_back
(
const
LoDTensor
&
x
)
{
if
(
size_
<
array_
.
size
())
{
array_
[
size_
++
]
=
x
;
}
else
{
array_
.
push_back
(
x
);
++
size_
;
}
}
void
resize
(
size_t
size
)
{
if
(
array_
.
size
()
<
size
)
{
array_
.
resize
(
size
);
}
size_
=
size
;
}
void
emplace_back
()
{
array_
.
emplace_back
();
}
void
emplace_back
(
LoDTensor
&&
x
)
{
array_
.
emplace_back
(
std
::
move
(
x
));
}
LoDTensor
&
back
()
{
return
array_
.
back
();
}
size_t
space
()
const
{
return
array_
.
size
();
}
void
reserve
(
size_t
size
)
{
// Naive warning to tell user this array might be to large. The memory and
// buffer used by this TensorArray will not be deleted during the training
// and inference phase, so attention not to make it expand too long.
if
(
size
>
800UL
)
{
LOG
(
WARNING
)
<<
"TensorArray has more than 800 items"
;
}
array_
.
reserve
(
size
);
}
bool
empty
()
const
{
return
size_
==
0UL
;
}
void
clear
()
{
size_
=
0UL
;
}
LoDTensor
&
operator
[](
size_t
id
)
{
return
array_
[
id
];
}
const
LoDTensor
&
operator
[](
size_t
id
)
const
{
return
array_
[
id
];
}
LoDTensor
&
at
(
size_t
id
)
{
return
array_
.
at
(
id
);
}
const
LoDTensor
&
at
(
size_t
id
)
const
{
return
array_
.
at
(
id
);
}
size_t
size
()
const
{
return
size_
;
}
private:
size_t
size_
{
0
};
std
::
vector
<
LoDTensor
>
array_
;
};
#endif // !PADDLE_ON_INFERENCE
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/operator.cc
浏览文件 @
cf1944af
...
...
@@ -354,18 +354,18 @@ void OperatorBase::GenerateTemporaryNames() {
}
}
static
bool
VarIsTensor
(
const
Variable
*
var
)
{
return
var
->
IsType
<
LoDTensor
>
()
||
var
->
IsType
<
SelectedRows
>
();
static
bool
VarIsTensor
(
const
Variable
&
var
)
{
return
var
.
IsType
<
LoDTensor
>
()
||
var
.
IsType
<
SelectedRows
>
();
}
static
const
Tensor
*
GetTensorFromVar
(
Variable
*
var
)
{
if
(
var
->
IsType
<
LoDTensor
>
())
{
return
var
->
GetMutable
<
LoDTensor
>
(
);
}
else
if
(
var
->
IsType
<
SelectedRows
>
())
{
return
var
->
GetMutable
<
SelectedRows
>
()
->
mutable_value
(
);
const
Tensor
*
GetTensorFromVar
(
const
Variable
&
var
)
{
if
(
var
.
IsType
<
LoDTensor
>
())
{
return
static_cast
<
const
Tensor
*>
(
&
(
var
.
Get
<
LoDTensor
>
())
);
}
else
if
(
var
.
IsType
<
SelectedRows
>
())
{
return
&
(
var
.
Get
<
SelectedRows
>
().
value
()
);
}
else
{
PADDLE_THROW
(
"Variable type_id %s, expect LoDTensor/SelectedRows."
,
var
->
Type
().
name
());
var
.
Type
().
name
());
}
}
...
...
@@ -415,8 +415,7 @@ bool ExecutionContext::HasOutput(const std::string& name) const {
template
<
>
const
Tensor
*
ExecutionContext
::
Input
<
Tensor
>
(
const
std
::
string
&
name
)
const
{
auto
*
var
=
InputVar
(
name
);
return
var
==
nullptr
?
nullptr
:
GetTensorFromVar
(
const_cast
<
Variable
*>
(
var
));
return
var
==
nullptr
?
nullptr
:
GetTensorFromVar
(
*
var
);
}
template
<
>
...
...
@@ -428,7 +427,7 @@ const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
std
::
transform
(
names
.
begin
(),
names
.
end
(),
std
::
back_inserter
(
res
),
[
&
](
const
std
::
string
&
sub_name
)
{
auto
var
=
scope_
.
FindVar
(
sub_name
);
return
var
==
nullptr
?
nullptr
:
GetTensorFromVar
(
var
);
return
var
==
nullptr
?
nullptr
:
GetTensorFromVar
(
*
var
);
});
return
res
;
}
...
...
@@ -770,8 +769,10 @@ void OperatorWithKernel::TransferInplaceVarsBack(
for
(
auto
&
var_name
:
inplace_vars
)
{
VLOG
(
3
)
<<
"share inplace var "
+
var_name
+
" back to it's original scope"
;
auto
*
original_tensor
=
GetMutableTensorFromVar
(
scope
.
FindVar
(
var_name
));
auto
*
transformed_tensor
=
GetTensorFromVar
(
transfer_scope
.
FindVar
(
var_name
));
auto
*
var
=
transfer_scope
.
FindVar
(
var_name
);
PADDLE_ENFORCE
(
var
!=
nullptr
,
"The var[%s] should not be nullptr"
,
var_name
);
auto
*
transformed_tensor
=
GetTensorFromVar
(
*
var
);
original_tensor
->
ShareDataWith
(
*
transformed_tensor
);
}
}
...
...
@@ -784,11 +785,11 @@ Scope* OperatorWithKernel::TryTransferData(
for
(
auto
&
var_name
:
var_name_item
.
second
)
{
auto
*
var
=
scope
.
FindVar
(
var_name
);
// Only tensor can be tranfer to another device.
if
(
var
==
nullptr
||
!
VarIsTensor
(
var
))
{
if
(
var
==
nullptr
||
!
VarIsTensor
(
*
var
))
{
continue
;
}
auto
*
tensor_in
=
GetTensorFromVar
(
var
);
auto
*
tensor_in
=
GetTensorFromVar
(
*
var
);
if
(
!
tensor_in
->
IsInitialized
())
{
continue
;
}
...
...
paddle/fluid/framework/operator.h
浏览文件 @
cf1944af
...
...
@@ -63,6 +63,7 @@ inline std::string GradVarName(const std::string& var_name) {
}
proto
::
VarType
::
Type
GetDataTypeOfVar
(
const
Variable
*
var
);
const
Tensor
*
GetTensorFromVar
(
const
Variable
&
var
);
class
OperatorBase
;
class
ExecutionContext
;
...
...
paddle/fluid/framework/parallel_executor.cc
浏览文件 @
cf1944af
...
...
@@ -303,10 +303,8 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
}
ParallelExecutor
::~
ParallelExecutor
()
{
const
auto
dev_ctxs
=
platform
::
DeviceContextPool
::
Instance
().
GetAllDeviceContexts
();
for
(
auto
&
dev_ctx
:
dev_ctxs
)
{
dev_ctx
->
Wait
();
for
(
auto
&
p
:
member_
->
places_
)
{
platform
::
DeviceContextPool
::
Instance
().
Get
(
p
)
->
Wait
();
}
if
(
member_
->
own_local_scope_
)
{
...
...
paddle/fluid/framework/tensor_test.cc
浏览文件 @
cf1944af
...
...
@@ -75,6 +75,19 @@ TEST(Tensor, MutableData) {
platform
::
CPUPlace
());
EXPECT_EQ
(
p1
,
p2
);
}
// Not sure if it's desired, but currently, Tensor type can be changed.
{
framework
::
Tensor
src_tensor
;
int8_t
*
p1
=
src_tensor
.
mutable_data
<
int8_t
>
(
framework
::
make_ddim
({
1
}),
platform
::
CPUPlace
());
EXPECT_NE
(
p1
,
nullptr
);
*
p1
=
1
;
uint8_t
*
p2
=
src_tensor
.
mutable_data
<
uint8_t
>
(
framework
::
make_ddim
({
1
}),
platform
::
CPUPlace
());
EXPECT_NE
(
p2
,
nullptr
);
EXPECT_EQ
(
static_cast
<
int
>
(
p2
[
0
]),
1
);
}
#ifdef PADDLE_WITH_CUDA
{
...
...
paddle/fluid/inference/CMakeLists.txt
浏览文件 @
cf1944af
if
(
WITH_TESTING
)
include
(
test.cmake
)
# some generic cmake funtion for inference
endif
()
# analysis and tensorrt must be added before creating static library,
# otherwise, there would be undefined reference to them in static library.
add_subdirectory
(
analysis
)
...
...
paddle/fluid/inference/analysis/CMakeLists.txt
浏览文件 @
cf1944af
...
...
@@ -20,22 +20,17 @@ cc_test(test_node SRCS node_tester.cc DEPS analysis)
cc_test
(
test_dot SRCS dot_tester.cc DEPS analysis
)
cc_binary
(
inference_analyzer SRCS analyzer_main.cc DEPS analysis paddle_fluid
)
function
(
inference_analysis_test TARGET
)
function
(
inference_analysis_test TARGET
)
if
(
WITH_TESTING
)
set
(
options
""
)
set
(
oneValueArgs
""
)
set
(
multiValueArgs SRCS ARGS EXTRA_DEPS
)
cmake_parse_arguments
(
analysis_test
"
${
options
}
"
"
${
oneValueArgs
}
"
"
${
multiValueArgs
}
"
${
ARGN
}
)
set
(
mem_opt
""
)
if
(
WITH_GPU
)
set
(
mem_opt
"--fraction_of_gpu_memory_to_use=0.5"
)
endif
()
cc_test
(
${
TARGET
}
SRCS
"
${
analysis_test_SRCS
}
"
inference_base_test
(
${
TARGET
}
SRCS
${
analysis_test_SRCS
}
DEPS analysis pass
${
GLOB_PASS_LIB
}
${
analysis_test_EXTRA_DEPS
}
ARGS --inference_model_dir=
${
PYTHON_TESTS_DIR
}
/book/word2vec.inference.model
${
mem_opt
}
${
analysis_test_ARGS
}
)
set_tests_properties
(
${
TARGET
}
PROPERTIES DEPENDS test_word2vec
)
endif
(
WITH_TESTING
)
ARGS --inference_model_dir=
${
WORD2VEC_MODEL_DIR
}
${
analysis_test_ARGS
}
)
endif
()
endfunction
(
inference_analysis_test
)
inference_analysis_test
(
test_analyzer SRCS analyzer_tester.cc EXTRA_DEPS paddle_inference_api
)
...
...
paddle/fluid/inference/api/CMakeLists.txt
浏览文件 @
cf1944af
...
...
@@ -17,39 +17,12 @@ if(APPLE)
set
(
CMAKE_CXX_FLAGS
"
${
CMAKE_CXX_FLAGS
}
-Wno-error=pessimizing-move"
)
endif
(
APPLE
)
set
(
inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager naive_executor
${
GLOB_PASS_LIB
}
)
set
(
inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager naive_executor
${
GLOB_PASS_LIB
}
)
if
(
WITH_GPU AND TENSORRT_FOUND
)
set
(
inference_deps
${
inference_deps
}
paddle_inference_tensorrt_subgraph_engine analysis_predictor
)
endif
()
function
(
inference_api_test TARGET_NAME
)
if
(
WITH_TESTING
)
set
(
options
""
)
set
(
oneValueArgs SRC
)
set
(
multiValueArgs ARGS
)
cmake_parse_arguments
(
inference_test
"
${
options
}
"
"
${
oneValueArgs
}
"
"
${
multiValueArgs
}
"
${
ARGN
}
)
if
(
WITH_GPU
)
cc_test
(
${
TARGET_NAME
}
SRCS
${
inference_test_SRC
}
DEPS
"
${
inference_deps
}
"
ARGS --dirname=
${
PYTHON_TESTS_DIR
}
/book/ --fraction_of_gpu_memory_to_use=0.15
)
else
()
cc_test
(
${
TARGET_NAME
}
SRCS
${
inference_test_SRC
}
DEPS
"
${
inference_deps
}
"
ARGS --dirname=
${
PYTHON_TESTS_DIR
}
/book/
)
endif
()
if
(
inference_test_ARGS
)
set_tests_properties
(
${
TARGET_NAME
}
PROPERTIES DEPENDS
"
${
inference_test_ARGS
}
"
)
endif
()
endif
(
WITH_TESTING
)
endfunction
(
inference_api_test
)
cc_library
(
reset_tensor_array SRCS details/reset_tensor_array.cc DEPS lod_tensor scope
)
cc_library
(
paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS reset_tensor_array lod_tensor scope
)
cc_library
(
analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor
)
...
...
@@ -59,10 +32,11 @@ cc_test(test_paddle_inference_api
SRCS api_tester.cc
DEPS paddle_inference_api
)
inference_api_test
(
test_api_impl SRC api_impl_tester.cc
ARGS test_word2vec test_image_classification
)
set
(
PYTHON_TESTS_DIR
${
PADDLE_BINARY_DIR
}
/python/paddle/fluid/tests
)
if
(
WITH_TESTING
)
inference_base_test
(
test_api_impl SRCS api_impl_tester.cc DEPS
${
inference_deps
}
ARGS --word2vec_dirname=
${
WORD2VEC_MODEL_DIR
}
--book_dirname=
${
PYTHON_TESTS_DIR
}
/book
)
set_tests_properties
(
test_api_impl PROPERTIES DEPENDS test_image_classification
)
endif
()
cc_test
(
test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor
${
inference_deps
}
paddle_inference_api
ARGS --dirname=
${
PYTHON_TESTS_DIR
}
/book
)
...
...
@@ -70,8 +44,10 @@ if(WITH_GPU AND TENSORRT_FOUND)
cc_library
(
paddle_inference_tensorrt_subgraph_engine
SRCS api_tensorrt_subgraph_engine.cc
DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api tensorrt_converter zero_copy_tensor_dummy
)
inference_api_test
(
test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_engine_tester.cc ARGS test_word2vec
)
if
(
WITH_TESTING
)
inference_base_test
(
test_api_tensorrt_subgraph_engine SRCS api_tensorrt_subgraph_engine_tester.cc DEPS
${
inference_deps
}
ARGS --dirname=
${
WORD2VEC_MODEL_DIR
}
)
endif
()
endif
()
if
(
WITH_ANAKIN AND WITH_MKL
)
# only needed in CI
...
...
paddle/fluid/inference/api/api_impl_tester.cc
浏览文件 @
cf1944af
...
...
@@ -27,7 +27,9 @@ limitations under the License. */
#define ACC_DIFF 1e-3
#endif
DEFINE_string
(
dirname
,
""
,
"Directory of the inference model."
);
DEFINE_string
(
word2vec_dirname
,
""
,
"Directory of the word2vec inference model."
);
DEFINE_string
(
book_dirname
,
""
,
"Directory of the book inference model."
);
namespace
paddle
{
...
...
@@ -49,7 +51,7 @@ PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) {
NativeConfig
GetConfig
()
{
NativeConfig
config
;
config
.
model_dir
=
FLAGS_
dirname
+
"/word2vec.inference.model"
;
config
.
model_dir
=
FLAGS_
word2vec_dirname
;
LOG
(
INFO
)
<<
"dirname "
<<
config
.
model_dir
;
config
.
fraction_of_gpu_memory
=
0.15
;
#ifdef PADDLE_WITH_CUDA
...
...
@@ -116,7 +118,7 @@ void MainImageClassification(bool use_gpu) {
NativeConfig
config
=
GetConfig
();
config
.
use_gpu
=
use_gpu
;
config
.
model_dir
=
FLAGS_dirname
+
"/image_classification_resnet.inference.model"
;
FLAGS_
book_
dirname
+
"/image_classification_resnet.inference.model"
;
const
bool
is_combined
=
false
;
std
::
vector
<
std
::
vector
<
int64_t
>>
feed_target_shapes
=
...
...
@@ -187,7 +189,7 @@ void MainThreadsWord2Vec(bool use_gpu) {
std
::
vector
<
std
::
thread
>
threads
;
for
(
int
tid
=
0
;
tid
<
num_jobs
;
++
tid
)
{
threads
.
emplace_back
([
&
,
tid
]()
{
auto
predictor
=
main_predictor
->
Clone
(
);
auto
predictor
=
CreatePaddlePredictor
(
config
);
auto
&
local_inputs
=
paddle_tensor_feeds
[
tid
];
std
::
vector
<
PaddleTensor
>
local_outputs
;
ASSERT_TRUE
(
predictor
->
Run
(
local_inputs
,
&
local_outputs
));
...
...
@@ -220,7 +222,7 @@ void MainThreadsImageClassification(bool use_gpu) {
NativeConfig
config
=
GetConfig
();
config
.
use_gpu
=
use_gpu
;
config
.
model_dir
=
FLAGS_dirname
+
"/image_classification_resnet.inference.model"
;
FLAGS_
book_
dirname
+
"/image_classification_resnet.inference.model"
;
auto
main_predictor
=
CreatePaddlePredictor
<
NativeConfig
>
(
config
);
std
::
vector
<
framework
::
LoDTensor
>
jobs
(
num_jobs
);
...
...
@@ -245,7 +247,7 @@ void MainThreadsImageClassification(bool use_gpu) {
std
::
vector
<
std
::
thread
>
threads
;
for
(
int
tid
=
0
;
tid
<
num_jobs
;
++
tid
)
{
threads
.
emplace_back
([
&
,
tid
]()
{
auto
predictor
=
main_predictor
->
Clone
(
);
auto
predictor
=
CreatePaddlePredictor
(
config
);
auto
&
local_inputs
=
paddle_tensor_feeds
[
tid
];
std
::
vector
<
PaddleTensor
>
local_outputs
;
ASSERT_TRUE
(
predictor
->
Run
(
local_inputs
,
&
local_outputs
));
...
...
paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc
浏览文件 @
cf1944af
...
...
@@ -29,13 +29,13 @@ void CompareTensorRTWithFluid(bool enable_tensorrt) {
//# 1. Create PaddlePredictor with a config.
NativeConfig
config0
;
config0
.
model_dir
=
FLAGS_dirname
+
"word2vec.inference.model"
;
config0
.
model_dir
=
FLAGS_dirname
;
config0
.
use_gpu
=
true
;
config0
.
fraction_of_gpu_memory
=
0.3
;
config0
.
device
=
0
;
MixedRTConfig
config1
;
config1
.
model_dir
=
FLAGS_dirname
+
"word2vec.inference.model"
;
config1
.
model_dir
=
FLAGS_dirname
;
config1
.
use_gpu
=
true
;
config1
.
fraction_of_gpu_memory
=
0.3
;
config1
.
device
=
0
;
...
...
paddle/fluid/inference/api/demo_ci/run.sh
浏览文件 @
cf1944af
...
...
@@ -62,7 +62,7 @@ for WITH_STATIC_LIB in ON OFF; do
-DWITH_GPU
=
$TEST_GPU_CPU
\
-DWITH_STATIC_LIB
=
$WITH_STATIC_LIB
make
-j
word2vec_model
=
$
{
PADDLE_ROOT
}
'/build/python/paddle/fluid/tests/book
/word2vec.inference.model'
word2vec_model
=
$
DATA_DIR
'/word2vec
/word2vec.inference.model'
if
[
-d
$word2vec_model
]
;
then
for
use_gpu
in
$use_gpu_list
;
do
./simple_on_word2vec
\
...
...
paddle/fluid/inference/test.cmake
0 → 100644
浏览文件 @
cf1944af
set
(
INFERENCE_URL
"http://paddle-inference-dist.cdn.bcebos.com"
CACHE STRING
"inference download url"
)
set
(
INFERENCE_DEMO_INSTALL_DIR
"
${
THIRD_PARTY_PATH
}
/inference_demo"
CACHE STRING
"A path setting inference demo download directories."
)
function
(
inference_download install_dir url filename
)
message
(
STATUS
"Download inference test stuff from
${
url
}
/
${
filename
}
"
)
execute_process
(
COMMAND bash -c
"mkdir -p
${
install_dir
}
"
)
execute_process
(
COMMAND bash -c
"cd
${
install_dir
}
&& wget -q
${
url
}
/
${
filename
}
"
)
message
(
STATUS
"finish downloading
${
filename
}
"
)
endfunction
()
function
(
inference_download_and_uncompress install_dir url filename
)
inference_download
(
${
install_dir
}
${
url
}
${
filename
}
)
execute_process
(
COMMAND bash -c
"cd
${
install_dir
}
&& tar xzf
${
filename
}
"
)
endfunction
()
set
(
WORD2VEC_INSTALL_DIR
"
${
INFERENCE_DEMO_INSTALL_DIR
}
/word2vec"
)
if
(
NOT EXISTS
${
WORD2VEC_INSTALL_DIR
}
)
inference_download_and_uncompress
(
${
WORD2VEC_INSTALL_DIR
}
${
INFERENCE_URL
}
"word2vec.inference.model.tar.gz"
)
endif
()
set
(
WORD2VEC_MODEL_DIR
"
${
WORD2VEC_INSTALL_DIR
}
/word2vec.inference.model"
)
function
(
inference_base_test TARGET
)
set
(
options
""
)
set
(
oneValueArgs
""
)
set
(
multiValueArgs SRCS ARGS DEPS
)
cmake_parse_arguments
(
base_test
"
${
options
}
"
"
${
oneValueArgs
}
"
"
${
multiValueArgs
}
"
${
ARGN
}
)
if
(
WITH_GPU
)
set
(
mem_opt
"--fraction_of_gpu_memory_to_use=0.5"
)
endif
()
cc_test
(
${
TARGET
}
SRCS
${
base_test_SRCS
}
DEPS
${
base_test_DEPS
}
ARGS
${
mem_opt
}
${
base_test_ARGS
}
)
endfunction
()
paddle/fluid/inference/tests/api/CMakeLists.txt
浏览文件 @
cf1944af
set
(
INFERENCE_URL
"http://paddle-inference-dist.cdn.bcebos.com"
)
set
(
INFERENCE_DEMO_INSTALL_DIR
"
${
THIRD_PARTY_PATH
}
/inference_demo"
CACHE STRING
"A path setting inference demo download directories."
)
set
(
INFERENCE_EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor
)
function
(
inference_download install_dir url filename
)
message
(
STATUS
"Download inference test stuff from
${
url
}
/
${
filename
}
"
)
execute_process
(
COMMAND bash -c
"mkdir -p
${
install_dir
}
"
)
execute_process
(
COMMAND bash -c
"cd
${
install_dir
}
&& wget -q
${
url
}
/
${
filename
}
"
)
message
(
STATUS
"finish downloading
${
filename
}
"
)
endfunction
()
function
(
inference_download_and_uncompress install_dir url filename
)
inference_download
(
${
install_dir
}
${
url
}
${
filename
}
)
execute_process
(
COMMAND bash -c
"cd
${
install_dir
}
&& tar xzf
${
filename
}
"
)
endfunction
()
function
(
download_model_and_data install_dir model_name data_name
)
if
(
NOT EXISTS
${
install_dir
}
)
...
...
paddle/fluid/operators/add_position_encoding_op.cc
0 → 100644
浏览文件 @
cf1944af
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/add_position_encoding_op.h"
namespace
paddle
{
namespace
operators
{
class
AddPositionEncodingOp
:
public
framework
::
OperatorWithKernel
{
public:
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"X"
),
"X(Input) of add_position_encoding_op should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Out"
),
"Out(Output) of add_position_encoding_op should not be null."
);
auto
x_dims
=
ctx
->
GetInputDim
(
"X"
);
ctx
->
SetOutputDim
(
"Out"
,
x_dims
);
ctx
->
ShareLoD
(
"X"
,
/*->*/
"Out"
);
}
};
class
AddPositionEncodingOpGrad
:
public
framework
::
OperatorWithKernel
{
public:
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"X"
),
"X(Input) must not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Out"
),
"Out must not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
framework
::
GradVarName
(
"Out"
)),
"Out@GRAD must not be null."
);
auto
out_dims
=
ctx
->
GetInputDim
(
"Out"
);
if
(
ctx
->
HasOutput
(
framework
::
GradVarName
(
"X"
)))
{
ctx
->
SetOutputDim
(
framework
::
GradVarName
(
"X"
),
out_dims
);
}
}
};
class
AddPositionEncodingOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
public:
void
Make
()
override
{
AddInput
(
"X"
,
"Input of AddPositionEncoding operator"
);
AddOutput
(
"Out"
,
"Output of AddPositionEncoding operator"
);
AddAttr
<
float
>
(
"alpha"
,
"The scale of Original Embedding."
)
.
SetDefault
(
1.0
f
)
.
AddCustomChecker
([](
const
float
&
alpha
)
{
PADDLE_ENFORCE
(
alpha
>=
0.0
f
,
"'alpha' must be above 0.0."
);
});
AddAttr
<
float
>
(
"beta"
,
"The scale of Position Embedding."
)
.
SetDefault
(
1.0
f
)
.
AddCustomChecker
([](
const
float
&
beta
)
{
PADDLE_ENFORCE
(
beta
>=
0.0
f
,
"'beta' must be between 0.0."
);
});
AddComment
(
R"DOC(
Add Position Encoding Operator.
The add position encoding calculates the output based on the input, alpha, beta.
The size of each dimension of the parameters checked in the infer-shape.
)DOC"
);
}
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
namespace
plt
=
paddle
::
platform
;
REGISTER_OPERATOR
(
add_position_encoding
,
ops
::
AddPositionEncodingOp
,
ops
::
AddPositionEncodingOpMaker
,
paddle
::
framework
::
DefaultGradOpDescMaker
<
true
>
);
REGISTER_OPERATOR
(
add_position_encoding_grad
,
ops
::
AddPositionEncodingOpGrad
);
REGISTER_OP_CPU_KERNEL
(
add_position_encoding
,
ops
::
AddPositionEncodingKernel
<
plt
::
CPUDeviceContext
,
float
>
,
ops
::
AddPositionEncodingKernel
<
plt
::
CPUDeviceContext
,
double
>
);
REGISTER_OP_CPU_KERNEL
(
add_position_encoding_grad
,
ops
::
AddPositionEncodingGradKernel
<
plt
::
CPUDeviceContext
,
float
>
,
ops
::
AddPositionEncodingGradKernel
<
plt
::
CPUDeviceContext
,
double
>
);
paddle/fluid/operators/add_position_encoding_op.h
0 → 100644
浏览文件 @
cf1944af
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
namespace
paddle
{
namespace
operators
{
template
<
typename
DeviceContext
,
typename
T
>
class
AddPositionEncodingKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
auto
*
X
=
context
.
Input
<
framework
::
LoDTensor
>
(
"X"
);
auto
&
x_lod
=
X
->
lod
();
auto
*
src_ptr
=
X
->
data
<
T
>
();
auto
*
Out
=
context
.
Output
<
framework
::
LoDTensor
>
(
"Out"
);
auto
*
dst_ptr
=
Out
->
mutable_data
<
T
>
(
context
.
GetPlace
());
float
alpha
=
context
.
Attr
<
float
>
(
"alpha"
);
float
beta
=
context
.
Attr
<
float
>
(
"beta"
);
auto
x_dim
=
X
->
dims
();
int
batch_size
=
0
;
int
max_seq_len
=
0
;
int
enc_size
=
0
;
if
(
x_lod
.
empty
())
{
PADDLE_ENFORCE
(
x_dim
.
size
()
==
3UL
,
"The input X of Add Position Encoding should be 3-D Tensor!"
);
batch_size
=
x_dim
[
0
];
max_seq_len
=
x_dim
[
1
];
enc_size
=
x_dim
[
2
];
}
else
{
PADDLE_ENFORCE
(
x_dim
.
size
()
==
2UL
,
"The input X of Add Position Encoding should be 2-D LoDTensor!"
);
PADDLE_ENFORCE
(
x_lod
.
size
()
==
1UL
,
"The Add Position Encoding Op only supports lod_level == 1!"
);
batch_size
=
x_lod
[
0
].
size
()
-
1
;
max_seq_len
=
-
1
;
enc_size
=
x_dim
[
1
];
}
PADDLE_ENFORCE
(
enc_size
%
2
==
0
,
"Only support even encode size!"
);
const
int
half_size
=
enc_size
/
2
;
for
(
int
i
=
0
;
i
<
batch_size
;
++
i
)
{
const
int
max_length
=
x_lod
.
empty
()
?
max_seq_len
:
x_lod
[
0
][
i
+
1
]
-
x_lod
[
0
][
i
];
for
(
int
j
=
0
;
j
<
max_length
;
++
j
)
{
for
(
int
k
=
0
;
k
<
half_size
;
++
k
)
{
const
double
val
=
(
half_size
>
1
)
?
j
/
pow
(
10000.0
,
double
(
k
)
/
(
half_size
-
1
))
:
j
/
10000.0
;
dst_ptr
[
k
]
=
src_ptr
[
k
]
*
alpha
+
sin
(
val
)
*
beta
;
dst_ptr
[
half_size
+
k
]
=
src_ptr
[
half_size
+
k
]
*
alpha
+
cos
(
val
)
*
beta
;
}
src_ptr
+=
enc_size
;
dst_ptr
+=
enc_size
;
}
}
}
};
template
<
typename
DeviceContext
,
typename
T
>
class
AddPositionEncodingGradKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
auto
*
dOut
=
context
.
Input
<
framework
::
LoDTensor
>
(
framework
::
GradVarName
(
"Out"
));
auto
dout
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
dOut
);
auto
*
dX
=
context
.
Output
<
framework
::
LoDTensor
>
(
framework
::
GradVarName
(
"X"
));
dX
->
mutable_data
<
T
>
(
context
.
GetPlace
());
auto
dx
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
dX
);
float
alpha
=
context
.
Attr
<
float
>
(
"alpha"
);
auto
*
place
=
context
.
template
device_context
<
DeviceContext
>().
eigen_device
();
dx
.
device
(
*
place
)
=
dout
*
static_cast
<
T
>
(
alpha
);
}
};
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/gather_op.cc
浏览文件 @
cf1944af
...
...
@@ -102,7 +102,9 @@ REGISTER_OPERATOR(gather, ops::GatherOp, ops::GatherOpMaker,
paddle
::
framework
::
DefaultGradOpDescMaker
<
true
>
);
REGISTER_OPERATOR
(
gather_grad
,
ops
::
GatherGradOp
);
REGISTER_OP_CPU_KERNEL
(
gather
,
ops
::
GatherOpKernel
<
float
>
,
ops
::
GatherOpKernel
<
int
>
,
ops
::
GatherOpKernel
<
double
>
);
ops
::
GatherOpKernel
<
double
>
,
ops
::
GatherOpKernel
<
int
>
,
ops
::
GatherOpKernel
<
int64_t
>
);
REGISTER_OP_CPU_KERNEL
(
gather_grad
,
ops
::
GatherGradientOpKernel
<
float
>
,
ops
::
GatherGradientOpKernel
<
double
>
,
ops
::
GatherGradientOpKernel
<
int
>
,
ops
::
GatherGradientOpKernel
<
double
>
);
ops
::
GatherGradientOpKernel
<
int64_t
>
);
paddle/fluid/operators/gather_op.cu
浏览文件 @
cf1944af
...
...
@@ -61,5 +61,11 @@ class GatherGradOpCUDAKernel : public framework::OpKernel<T> {
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
REGISTER_OP_CUDA_KERNEL
(
gather
,
ops
::
GatherOpCUDAKernel
<
float
>
);
REGISTER_OP_CUDA_KERNEL
(
gather_grad
,
ops
::
GatherGradOpCUDAKernel
<
float
>
);
REGISTER_OP_CUDA_KERNEL
(
gather
,
ops
::
GatherOpCUDAKernel
<
float
>
,
ops
::
GatherOpCUDAKernel
<
double
>
,
ops
::
GatherOpCUDAKernel
<
int64_t
>
,
ops
::
GatherOpCUDAKernel
<
int
>
);
REGISTER_OP_CUDA_KERNEL
(
gather_grad
,
ops
::
GatherGradOpCUDAKernel
<
float
>
,
ops
::
GatherGradOpCUDAKernel
<
double
>
,
ops
::
GatherGradOpCUDAKernel
<
int64_t
>
,
ops
::
GatherGradOpCUDAKernel
<
int
>
);
paddle/fluid/operators/math/sequence_pooling.cc
浏览文件 @
cf1944af
...
...
@@ -31,7 +31,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
typename
IndexType
=
Eigen
::
DenseIndex
>
using
EigenMatrix
=
framework
::
EigenMatrix
<
T
,
MajorType
,
IndexType
>
;
template
<
typename
T
>
template
<
typename
T
,
bool
is_test
>
class
MaxSeqPoolFunctor
{
public:
void
operator
()(
const
platform
::
CPUDeviceContext
&
context
,
...
...
@@ -70,7 +70,41 @@ class MaxSeqPoolFunctor {
}
}
};
// Instantisation of Max Sequence Pooling for test phase eg. no need to fill
// index buffer
template
<
typename
T
>
class
MaxSeqPoolFunctor
<
T
,
true
>
{
public:
void
operator
()(
const
platform
::
CPUDeviceContext
&
context
,
const
framework
::
LoDTensor
&
input
,
framework
::
Tensor
*
output
,
framework
::
Tensor
*
index
)
{
auto
in_dims
=
input
.
dims
();
auto
out_dims
=
output
->
dims
();
PADDLE_ENFORCE_GT
(
in_dims
.
size
(),
1
);
PADDLE_ENFORCE_GT
(
out_dims
.
size
(),
1
);
for
(
int64_t
i
=
1
;
i
<
in_dims
.
size
();
++
i
)
{
PADDLE_ENFORCE_EQ
(
in_dims
[
i
],
out_dims
[
i
]);
}
auto
starts
=
input
.
lod
()[
0
];
const
T
*
in_data
=
input
.
data
<
T
>
();
T
*
out_data
=
output
->
data
<
T
>
();
int64_t
num_seq
=
out_dims
[
0
];
int64_t
dim
=
output
->
numel
()
/
num_seq
;
for
(
int64_t
i
=
0
;
i
<
num_seq
;
++
i
)
{
std
::
memcpy
(
&
out_data
[
i
*
dim
],
&
in_data
[
starts
[
i
]
*
dim
],
dim
*
sizeof
(
T
));
for
(
size_t
j
=
starts
[
i
]
+
1
;
j
<
starts
[
i
+
1
];
++
j
)
{
for
(
int64_t
k
=
0
;
k
<
dim
;
++
k
)
{
if
(
in_data
[
j
*
dim
+
k
]
>
out_data
[
i
*
dim
+
k
])
{
out_data
[
i
*
dim
+
k
]
=
in_data
[
j
*
dim
+
k
];
}
}
}
}
}
};
template
<
typename
T
>
class
MaxSeqPoolGradFunctor
{
public:
...
...
@@ -188,11 +222,16 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> {
/* max pool has index output */
void
operator
()(
const
platform
::
CPUDeviceContext
&
context
,
const
std
::
string
pooltype
,
const
framework
::
LoDTensor
&
input
,
framework
::
Tensor
*
output
,
framework
::
Tensor
*
output
,
bool
is_test
,
framework
::
Tensor
*
index
=
nullptr
)
{
if
(
pooltype
==
"MAX"
)
{
math
::
MaxSeqPoolFunctor
<
T
>
max_pool
;
if
(
is_test
)
{
math
::
MaxSeqPoolFunctor
<
T
,
true
>
max_pool
;
max_pool
(
context
,
input
,
output
,
index
);
}
else
{
math
::
MaxSeqPoolFunctor
<
T
,
false
>
max_pool
;
max_pool
(
context
,
input
,
output
,
index
);
}
return
;
}
if
(
pooltype
==
"LAST"
)
{
...
...
@@ -200,6 +239,7 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> {
last_pool
(
context
,
input
,
output
);
return
;
}
if
(
pooltype
==
"FIRST"
)
{
math
::
FirstSeqPoolFunctor
<
T
>
first_pool
;
first_pool
(
context
,
input
,
output
);
...
...
paddle/fluid/operators/math/sequence_pooling.cu
浏览文件 @
cf1944af
...
...
@@ -133,7 +133,7 @@ class SequencePoolFunctor<platform::CUDADeviceContext, T> {
public:
void
operator
()(
const
platform
::
CUDADeviceContext
&
context
,
const
std
::
string
pooltype
,
const
framework
::
LoDTensor
&
input
,
framework
::
Tensor
*
output
,
framework
::
Tensor
*
output
,
bool
is_test
,
framework
::
Tensor
*
index
=
nullptr
)
{
auto
&
lod
=
input
.
lod
()[
0
];
const
size_t
item_dim
=
output
->
numel
()
/
output
->
dims
()[
0
];
...
...
paddle/fluid/operators/math/sequence_pooling.h
浏览文件 @
cf1944af
...
...
@@ -28,7 +28,7 @@ class SequencePoolFunctor {
/* max pool has index output */
void
operator
()(
const
DeviceContext
&
context
,
const
std
::
string
pooltype
,
const
framework
::
LoDTensor
&
input
,
framework
::
Tensor
*
output
,
framework
::
Tensor
*
index
=
nullptr
);
bool
is_test
=
false
,
framework
::
Tensor
*
index
=
nullptr
);
};
template
<
typename
DeviceContext
,
typename
T
>
...
...
paddle/fluid/operators/sequence_pool_op.cc
浏览文件 @
cf1944af
...
...
@@ -47,6 +47,7 @@ class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker {
"(Tensor<int>) This tensor is used for the sequence max-pooling "
"to record the max indexes."
)
.
AsIntermediate
();
AddAttr
<
bool
>
(
"is_test"
,
""
).
SetDefault
(
false
);
AddAttr
<
std
::
string
>
(
"pooltype"
,
"(string, default 'AVERAGE') the pooling pooltype of SequencePoolOp."
)
...
...
paddle/fluid/operators/sequence_pool_op.h
浏览文件 @
cf1944af
...
...
@@ -32,10 +32,6 @@ class SequencePoolKernel : public framework::OpKernel<T> {
auto
*
in
=
context
.
Input
<
LoDTensor
>
(
"X"
);
auto
*
out
=
context
.
Output
<
Tensor
>
(
"Out"
);
std
::
string
pooltype
=
context
.
Attr
<
std
::
string
>
(
"pooltype"
);
Tensor
*
index
=
nullptr
;
if
(
pooltype
==
"MAX"
)
{
index
=
context
.
Output
<
Tensor
>
(
"MaxIndex"
);
}
auto
dims
=
in
->
dims
();
auto
lod
=
in
->
lod
();
...
...
@@ -48,13 +44,22 @@ class SequencePoolKernel : public framework::OpKernel<T> {
dims
[
0
]
=
lod
[
0
].
size
()
-
1
;
out
->
Resize
({
dims
});
out
->
mutable_data
<
T
>
(
context
.
GetPlace
());
if
(
pooltype
==
"MAX"
)
{
Tensor
*
index
=
nullptr
;
const
bool
is_test
=
context
.
Attr
<
bool
>
(
"is_test"
);
// Do not create index buffer for inference (is_test) mode
// TODO(jczaja): Skip index buffer creation for other devices eg. GPU
if
(
pooltype
==
"MAX"
&&
(
is_test
==
false
||
platform
::
is_cpu_place
(
context
.
GetPlace
())
==
false
))
{
index
=
context
.
Output
<
Tensor
>
(
"MaxIndex"
);
index
->
Resize
({
dims
});
index
->
mutable_data
<
int
>
(
context
.
GetPlace
());
}
math
::
SequencePoolFunctor
<
DeviceContext
,
T
>
pool
;
pool
(
context
.
template
device_context
<
DeviceContext
>(),
pooltype
,
*
in
,
out
,
index
);
i
s_test
,
i
ndex
);
}
};
...
...
paddle/fluid/operators/sum_op.cc
浏览文件 @
cf1944af
...
...
@@ -67,6 +67,7 @@ class SumOp : public framework::OperatorWithKernel {
framework
::
OpKernelType
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
x_vars
=
ctx
.
MultiInputVar
(
"X"
);
auto
x_vars_name
=
ctx
.
Inputs
(
"X"
);
framework
::
LibraryType
library
{
framework
::
LibraryType
::
kPlain
};
framework
::
DataLayout
layout
{
framework
::
DataLayout
::
kAnyLayout
};
...
...
@@ -81,15 +82,18 @@ class SumOp : public framework::OperatorWithKernel {
if
(
x_vars
[
0
]
->
IsType
<
framework
::
LoDTensor
>
())
{
int
dtype
=
-
1
;
for
(
auto
&
x_var
:
x_vars
)
{
auto
&
lod_tensor
=
x_var
->
Get
<
framework
::
LoDTensor
>
();
if
(
lod_tensor
.
numel
()
==
0
)
{
for
(
size_t
idx
=
0
;
idx
<
x_vars
.
size
();
++
idx
)
{
PADDLE_ENFORCE
(
x_vars
[
idx
]
!=
nullptr
,
"Input var[%s] should not be nullptr"
,
x_vars_name
[
idx
]);
// FIXME(zcd): The input x_var may be SelectedRows or LoDTensor.
auto
tensor
=
framework
::
GetTensorFromVar
(
*
x_vars
[
idx
]);
if
(
tensor
->
numel
()
==
0
)
{
continue
;
}
if
(
dtype
==
-
1
)
{
dtype
=
framework
::
ToDataType
(
lod_tensor
.
type
());
dtype
=
framework
::
ToDataType
(
tensor
->
type
());
}
else
{
PADDLE_ENFORCE_EQ
(
dtype
,
framework
::
ToDataType
(
lod_tensor
.
type
()));
PADDLE_ENFORCE_EQ
(
dtype
,
framework
::
ToDataType
(
tensor
->
type
()));
}
}
PADDLE_ENFORCE_NE
(
dtype
,
-
1
,
...
...
paddle/fluid/platform/device_context.cc
浏览文件 @
cf1944af
...
...
@@ -32,23 +32,25 @@ platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) {
"'Place' is not supported, Please re-compile with WITH_GPU "
"option"
);
}
return
it
->
second
.
get
();
return
it
->
second
.
get
()
.
get
()
;
}
const
std
::
vector
<
const
DeviceContext
*>
DeviceContextPool
::
GetAllDeviceContexts
()
const
{
std
::
vector
<
const
DeviceContext
*>
all_device_ctx
;
all_device_ctx
.
reserve
(
device_contexts_
.
size
());
for
(
auto
&
dev_ctx
:
device_contexts_
)
{
all_device_ctx
.
emplace_back
(
dev_ctx
.
second
.
get
());
}
return
all_device_ctx
;
template
<
typename
DevCtx
,
typename
PlaceType
>
inline
void
EmplaceDeviceContext
(
std
::
map
<
Place
,
std
::
shared_future
<
std
::
unique_ptr
<
DeviceContext
>>>*
map_ptr
,
platform
::
Place
p
)
{
using
PtrType
=
std
::
unique_ptr
<
DeviceContext
>
;
map_ptr
->
emplace
(
p
,
std
::
async
(
std
::
launch
::
deferred
,
[
=
]
{
// lazy evaluation. i.e., only create device context at
// first `Get`
return
PtrType
(
new
DevCtx
(
boost
::
get
<
PlaceType
>
(
p
)));
}));
}
DeviceContextPool
::
DeviceContextPool
(
const
std
::
vector
<
platform
::
Place
>&
places
)
{
PADDLE_ENFORCE_GT
(
places
.
size
(),
0
);
using
PtrType
=
std
::
unique_ptr
<
DeviceContext
>
;
std
::
set
<
Place
>
set
;
for
(
auto
&
p
:
places
)
{
set
.
insert
(
p
);
...
...
@@ -57,16 +59,13 @@ DeviceContextPool::DeviceContextPool(
for
(
auto
&
p
:
set
)
{
if
(
platform
::
is_cpu_place
(
p
))
{
#ifdef PADDLE_WITH_MKLDNN
device_contexts_
.
emplace
(
p
,
PtrType
(
new
MKLDNNDeviceContext
(
boost
::
get
<
CPUPlace
>
(
p
))));
EmplaceDeviceContext
<
MKLDNNDeviceContext
,
CPUPlace
>
(
&
device_contexts_
,
p
);
#else
device_contexts_
.
emplace
(
p
,
PtrType
(
new
CPUDeviceContext
(
boost
::
get
<
CPUPlace
>
(
p
))));
EmplaceDeviceContext
<
CPUDeviceContext
,
CPUPlace
>
(
&
device_contexts_
,
p
);
#endif
}
else
if
(
platform
::
is_gpu_place
(
p
))
{
#ifdef PADDLE_WITH_CUDA
device_contexts_
.
emplace
(
p
,
PtrType
(
new
CUDADeviceContext
(
boost
::
get
<
CUDAPlace
>
(
p
))));
EmplaceDeviceContext
<
CUDADeviceContext
,
CUDAPlace
>
(
&
device_contexts_
,
p
);
#else
PADDLE_THROW
(
"'CUDAPlace' is not supported, Please re-compile with WITH_GPU "
...
...
@@ -74,9 +73,8 @@ DeviceContextPool::DeviceContextPool(
#endif
}
else
if
(
platform
::
is_cuda_pinned_place
(
p
))
{
#ifdef PADDLE_WITH_CUDA
device_contexts_
.
emplace
(
p
,
PtrType
(
new
CUDAPinnedDeviceContext
(
boost
::
get
<
CUDAPinnedPlace
>
(
p
))));
EmplaceDeviceContext
<
CUDAPinnedDeviceContext
,
CUDAPinnedPlace
>
(
&
device_contexts_
,
p
);
#else
PADDLE_THROW
(
"'CUDAPlace' is not supported, Please re-compile with WITH_GPU "
...
...
paddle/fluid/platform/device_context.h
浏览文件 @
cf1944af
...
...
@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <future> // NOLINT
#include <memory>
#include <mutex> // NOLINT
#include <string>
...
...
@@ -223,9 +224,6 @@ class DeviceContextPool {
/*! \brief Return handle of single device context. */
platform
::
DeviceContext
*
Get
(
const
platform
::
Place
&
place
);
/*! \brief Return all the device contexts. */
const
std
::
vector
<
const
DeviceContext
*>
GetAllDeviceContexts
()
const
;
template
<
typename
Place
>
const
typename
DefaultDeviceContextType
<
Place
>::
TYPE
*
GetByPlace
(
const
Place
&
place
)
{
...
...
@@ -237,7 +235,8 @@ class DeviceContextPool {
private:
static
DeviceContextPool
*
pool
;
std
::
map
<
Place
,
std
::
unique_ptr
<
DeviceContext
>>
device_contexts_
;
std
::
map
<
Place
,
std
::
shared_future
<
std
::
unique_ptr
<
DeviceContext
>>>
device_contexts_
;
DISABLE_COPY_AND_ASSIGN
(
DeviceContextPool
);
};
...
...
paddle/scripts/paddle_build.sh
浏览文件 @
cf1944af
...
...
@@ -147,13 +147,11 @@ function cmake_gen() {
-DWITH_SWIG_PY=
${
WITH_SWIG_PY
:-
ON
}
-DCUDNN_ROOT=/usr/
-DWITH_TESTING=
${
WITH_TESTING
:-
ON
}
-DWITH_FAST_BUNDLE_TEST=ON
-DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-DWITH_FLUID_ONLY=
${
WITH_FLUID_ONLY
:-
OFF
}
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-DWITH_CONTRIB=
${
WITH_CONTRIB
:-
ON
}
-DWITH_INFERENCE=
${
WITH_INFERENCE
:-
ON
}
-DWITH_INFERENCE_API_TEST=
${
WITH_INFERENCE_API_TEST
:-
ON
}
-DINFERENCE_DEMO_INSTALL_DIR=
${
INFERENCE_DEMO_INSTALL_DIR
}
-DWITH_ANAKIN=
${
WITH_ANAKIN
:-
OFF
}
...
...
@@ -181,12 +179,10 @@ EOF
-DWITH_PYTHON
=
${
WITH_PYTHON
:-
ON
}
\
-DCUDNN_ROOT
=
/usr/
\
-DWITH_TESTING
=
${
WITH_TESTING
:-
ON
}
\
-DWITH_FAST_BUNDLE_TEST
=
ON
\
-DCMAKE_MODULE_PATH
=
/opt/rocm/hip/cmake
\
-DWITH_FLUID_ONLY
=
${
WITH_FLUID_ONLY
:-
OFF
}
\
-DCMAKE_EXPORT_COMPILE_COMMANDS
=
ON
\
-DWITH_CONTRIB
=
${
WITH_CONTRIB
:-
ON
}
\
-DWITH_INFERENCE
=
${
WITH_INFERENCE
:-
ON
}
\
-DWITH_INFERENCE_API_TEST
=
${
WITH_INFERENCE_API_TEST
:-
ON
}
\
-DINFERENCE_DEMO_INSTALL_DIR
=
${
INFERENCE_DEMO_INSTALL_DIR
}
\
-DWITH_ANAKIN
=
${
WITH_ANAKIN
:-
OFF
}
\
...
...
@@ -653,7 +649,7 @@ function gen_capi_package() {
function
gen_fluid_lib
()
{
mkdir
-p
${
PADDLE_ROOT
}
/build
cd
${
PADDLE_ROOT
}
/build
if
[[
${
WITH_C_API
:-
OFF
}
==
"OFF"
&&
${
WITH_INFERENCE
:-
ON
}
==
"ON"
]]
;
then
if
[[
${
WITH_C_API
:-
OFF
}
==
"OFF"
]]
;
then
cat
<<
EOF
========================================
Generating fluid library for train and inference ...
...
...
@@ -666,7 +662,7 @@ EOF
}
function
tar_fluid_lib
()
{
if
[[
${
WITH_C_API
:-
OFF
}
==
"OFF"
&&
${
WITH_INFERENCE
:-
ON
}
==
"ON"
]]
;
then
if
[[
${
WITH_C_API
:-
OFF
}
==
"OFF"
]]
;
then
cat
<<
EOF
========================================
Taring fluid library for train and inference ...
...
...
@@ -681,7 +677,7 @@ EOF
}
function
test_fluid_lib
()
{
if
[[
${
WITH_C_API
:-
OFF
}
==
"OFF"
&&
${
WITH_INFERENCE
:-
ON
}
==
"ON"
]]
;
then
if
[[
${
WITH_C_API
:-
OFF
}
==
"OFF"
]]
;
then
cat
<<
EOF
========================================
Testing fluid library for inference ...
...
...
python/paddle/fluid/layers/nn.py
浏览文件 @
cf1944af
...
...
@@ -157,6 +157,8 @@ __all__ = [
'sequence_reverse'
,
'affine_channel'
,
'hash'
,
'log_loss'
,
'add_position_encoding'
,
]
...
...
@@ -747,7 +749,7 @@ def dynamic_gru(input,
attr
=
helper
.
bias_attr
,
shape
=
[
1
,
3
*
size
],
dtype
=
dtype
,
is_bias
=
True
)
batch_size
=
input
.
shape
[
0
]
inputs
=
{
'Input'
:
input
,
'Weight'
:
weight
,
'Bias'
:
bias
}
if
h_0
!=
None
:
if
h_0
:
assert
h_0
.
shape
==
(
batch_size
,
size
),
'The shape of h0 should be(batch_size, %d)'
%
size
...
...
@@ -1823,7 +1825,7 @@ def conv3d(input,
return
helper
.
append_activation
(
pre_act
)
def
sequence_pool
(
input
,
pool_type
):
def
sequence_pool
(
input
,
pool_type
,
is_test
=
False
):
"""
This function add the operator for sequence pooling.
It pools features of all time-steps of each instance, and is applied
...
...
@@ -1860,6 +1862,7 @@ def sequence_pool(input, pool_type):
input(variable): The input variable which is a LoDTensor.
pool_type (string): The pooling type of sequence_pool.
It supports average, sum, sqrt and max.
is_test(bool, Default False): Used distinguish training from scoring mode.
Returns:
The sequence pooling variable which is a Tensor.
...
...
@@ -1887,7 +1890,8 @@ def sequence_pool(input, pool_type):
inputs
=
{
"X"
:
input
},
outputs
=
{
"Out"
:
pool_out
,
"MaxIndex"
:
max_index
},
attrs
=
{
"pooltype"
:
pool_type
.
upper
()})
attrs
=
{
"pooltype"
:
pool_type
.
upper
(),
"is_test"
:
is_test
})
# when pool_type is max, variable max_index is initialized,
# so we stop the gradient explicitly here
...
...
@@ -3016,7 +3020,8 @@ def sequence_pad(x, pad_value, maxlen=None, name=None):
x = fluid.layers.data(name='y', shape=[10, 5],
dtype='float32', lod_level=1)
pad_value = fluid.layers.assign(input=numpy.array([0]))
pad_value = fluid.layers.assign(
input=numpy.array([0], dtype=numpy.float32))
out = fluid.layers.sequence_pad(x=x, pad_value=pad_value)
"""
...
...
@@ -7580,3 +7585,99 @@ def hash(input, hash_size, num_hash=1, name=None):
attrs
=
{
'num_hash'
:
num_hash
,
'mod_by'
:
hash_size
})
return
out
def
log_loss
(
input
,
label
,
epsilon
=
1e-4
,
name
=
None
):
"""
**Negative Log Loss Layer**
This layer accepts input predictions and target label and returns the
negative log loss.
.. math::
Out = -label *
\\
log{(input +
\\
epsilon)}
- (1 - label) *
\\
log{(1 - input +
\\
epsilon)}
Args:
input (Variable|list): a 2-D tensor with shape [N x 1], where N is the
batch size. This input is a probability computed
by the previous operator.
label (Variable|list): the ground truth which is a 2-D tensor with
shape [N x 1], where N is the batch size.
epsilon (float): epsilon
name (string): the name of log_loss
Returns:
Variable: A 2-D tensor with shape [N x 1], the negative log loss.
Examples:
.. code-block:: python
prob = fluid.layers.sigmoid(net)
cost = fluid.layers.log_loss(input=prob, label=label)
"""
helper
=
LayerHelper
(
'log_loss'
,
**
locals
())
if
name
is
None
:
loss
=
helper
.
create_variable_for_type_inference
(
dtype
=
input
.
dtype
)
else
:
loss
=
helper
.
create_variable
(
name
=
name
,
dtype
=
input
.
dtype
,
persistable
=
False
)
helper
.
append_op
(
type
=
'log_loss'
,
inputs
=
{
'Predicted'
:
[
input
],
'Labels'
:
[
label
]},
outputs
=
{
'Loss'
:
[
loss
]},
attrs
=
{
'epsilon'
:
epsilon
})
return
loss
def
add_position_encoding
(
input
,
alpha
,
beta
,
name
=
None
):
"""
**Add Position Encoding Layer**
This layer accepts an input 3D-Tensor of shape [N x M x P], and return an
output Tensor of shape [N x M x P] with positional encoding value.
Refer to `Attention Is All You Need<http://arxiv.org/pdf/1706.03762.pdf>`_ .
.. math::
PE(pos, 2i) =
\\
sin{(pos / 10000^{2i / P})}
\\\\
PE(pos, 2i + 1) =
\\
cos{(pos / 10000^{2i / P})}
\\\\
Out(:, pos, i) =
\\
alpha * input(:, pos, i) +
\\
beta * PE(pos, i)
Where:
* PE(pos, 2i): the increment for the number at even position
* PE(pos, 2i + 1): the increment for the number at odd position
Args:
input (Variable): 3-D input tensor with shape [N x M x P]
alpha (float): multiple of Input Tensor
beta (float): multiple of Positional Encoding Tensor
name (string): the name of position encoding layer
Returns:
Variable: A 3-D Tensor of shape [N x M x P] with positional encoding.
Examples:
.. code-block:: python
position_tensor = fluid.layers.add_position_encoding(input=tensor)
"""
helper
=
LayerHelper
(
'add_position_encoding'
,
**
locals
())
dtype
=
helper
.
input_dtype
()
if
name
is
None
:
out
=
helper
.
create_variable_for_type_inference
(
dtype
=
dtype
)
else
:
out
=
helper
.
create_variable
(
name
=
name
,
dtype
=
dtype
,
persistable
=
False
)
helper
.
append_op
(
type
=
"add_position_encoding"
,
inputs
=
{
"X"
:
input
},
outputs
=
{
"Out"
:
out
},
attrs
=
{
"alpha"
:
alpha
,
"beta"
:
beta
})
return
out
python/paddle/fluid/metrics.py
浏览文件 @
cf1944af
...
...
@@ -194,7 +194,7 @@ class CompositeMetric(MetricBase):
or soft-label, should custom the corresponding update rule.
"""
for
m
in
self
.
_metrics
:
ans
.
append
(
m
.
update
(
preds
,
labels
)
)
m
.
update
(
preds
,
labels
)
def
eval
(
self
):
"""
...
...
python/paddle/fluid/tests/CMakeLists.txt
浏览文件 @
cf1944af
set
(
PYTHON_TESTS_DIR
${
PADDLE_BINARY_DIR
}
/python/paddle/fluid/tests CACHE INTERNAL
"python tests directory"
)
file
(
GLOB TEST_OPS RELATIVE
"
${
CMAKE_CURRENT_SOURCE_DIR
}
"
"test_*.py"
)
string
(
REPLACE
".py"
""
TEST_OPS
"
${
TEST_OPS
}
"
)
...
...
python/paddle/fluid/tests/book/high-level-api/image_classification/CMakeLists.txt
浏览文件 @
cf1944af
file
(
GLOB TEST_OPS RELATIVE
"
${
CMAKE_CURRENT_SOURCE_DIR
}
"
"test_*.py"
)
string
(
REPLACE
".py"
""
TEST_OPS
"
${
TEST_OPS
}
"
)
# default test
foreach
(
src
${
TEST_OPS
}
)
if
(
NOT APPLE
)
# default test
foreach
(
src
${
TEST_OPS
}
)
py_test
(
${
src
}
SRCS
${
src
}
.py
)
endforeach
()
endforeach
()
else
()
foreach
(
src
${
TEST_OPS
}
)
if
(
${
src
}
STREQUAL
"test_image_classification_vgg"
)
message
(
WARNING
"These tests has been disabled in OSX for random fail:
\n
"
${
src
}
)
elseif
(
${
src
}
STREQUAL
"test_image_classification_resnet"
)
message
(
WARNING
"These tests has been disabled in OSX for random fail:
\n
"
${
src
}
)
elseif
()
py_test
(
${
src
}
SRCS
${
src
}
.py
)
endif
()
endforeach
()
endif
()
python/paddle/fluid/tests/unittests/CMakeLists.txt
浏览文件 @
cf1944af
...
...
@@ -17,6 +17,10 @@ if(NOT WITH_DISTRIBUTE)
list
(
REMOVE_ITEM TEST_OPS test_listen_and_serv_op
)
LIST
(
REMOVE_ITEM TEST_OPS test_dist_mnist
)
LIST
(
REMOVE_ITEM TEST_OPS test_dist_word2vec
)
LIST
(
REMOVE_ITEM TEST_OPS test_dist_ctr
)
LIST
(
REMOVE_ITEM TEST_OPS test_dist_simnet_bow
)
LIST
(
REMOVE_ITEM TEST_OPS test_dist_mnist_batch_merge
)
LIST
(
REMOVE_ITEM TEST_OPS test_dist_text_classification
)
endif
(
NOT WITH_DISTRIBUTE
)
list
(
REMOVE_ITEM TEST_OPS test_seq_concat_op
)
# FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290
...
...
@@ -55,6 +59,7 @@ function(py_test_modules TARGET_NAME)
if
(
py_test_modules_SERIAL
)
set_property
(
TEST
${
TARGET_NAME
}
PROPERTY RUN_SERIAL 1
)
endif
()
set_tests_properties
(
${
TARGET_NAME
}
PROPERTIES TIMEOUT 600
)
endif
()
endfunction
()
list
(
REMOVE_ITEM TEST_OPS test_warpctc_op
)
...
...
@@ -88,4 +93,6 @@ py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SE
py_test_modules
(
test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL
)
set_tests_properties
(
test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 150
)
py_test_modules
(
test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL
)
py_test_modules
(
test_image_classification_resnet MODULES test_image_classification_resnet SERIAL
)
if
(
NOT APPLE
)
py_test_modules
(
test_image_classification_resnet MODULES test_image_classification_resnet SERIAL
)
endif
()
python/paddle/fluid/tests/unittests/dist_mnist.py
浏览文件 @
cf1944af
...
...
@@ -90,8 +90,10 @@ class TestDistMnist2x2(TestDistRunnerBase):
inference_program
=
fluid
.
default_main_program
().
clone
()
# Optimization
opt
=
fluid
.
optimizer
.
AdamOptimizer
(
learning_rate
=
0.001
,
beta1
=
0.9
,
beta2
=
0.999
)
# TODO(typhoonzero): fix distributed adam optimizer
# opt = fluid.optimizer.AdamOptimizer(
# learning_rate=0.001, beta1=0.9, beta2=0.999)
opt
=
fluid
.
optimizer
.
Momentum
(
learning_rate
=
0.001
,
momentum
=
0.9
)
# Reader
train_reader
=
paddle
.
batch
(
...
...
python/paddle/fluid/tests/unittests/test_add_position_encoding_op.py
0 → 100644
浏览文件 @
cf1944af
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
unittest
import
numpy
as
np
import
math
import
paddle.fluid.core
as
core
from
op_test
import
OpTest
class
TestAddPositionEncodingTensorOp
(
OpTest
):
"""
This class is to test the AddPositionEncodingOp
"""
def
setUp
(
self
):
"""
the prepared section for add position encoding op
"""
self
.
op_type
=
"add_position_encoding"
self
.
dtype
=
np
.
float32
self
.
init_input_output
()
self
.
inputs
=
{
'X'
:
OpTest
.
np_dtype_to_fluid_dtype
(
self
.
x
),
}
self
.
outputs
=
{
'Out'
:
self
.
out
}
self
.
attrs
=
{
'alpha'
:
self
.
alpha
,
'beta'
:
self
.
beta
}
def
test_check_output
(
self
):
"""
check the correctness of output
"""
self
.
check_output
()
def
test_check_grad
(
self
):
"""
check the correctness of grad
"""
self
.
check_grad
([
'X'
],
'Out'
,
max_relative_error
=
0.005
)
def
init_input_output
(
self
):
"""
init the input and output for test cases
"""
self
.
alpha
=
0.6
self
.
beta
=
0.5
self
.
x
=
np
.
random
.
uniform
(
0.1
,
1
,
[
2
,
4
,
4
]).
astype
(
self
.
dtype
)
self
.
out
=
np
.
copy
(
self
.
x
)
batch_size
=
self
.
x
.
shape
[
0
]
max_length
=
self
.
x
.
shape
[
1
]
enc_size
=
self
.
x
.
shape
[
2
]
half_shape
=
int
(
enc_size
/
2
)
for
i
in
range
(
batch_size
):
for
j
in
range
(
max_length
):
for
k
in
range
(
half_shape
):
val
=
j
/
pow
(
10000.0
,
k
/
(
half_shape
-
1
))
if
half_shape
>
1
else
j
/
10000.0
self
.
out
[
i
,
j
,
k
]
=
\
self
.
x
[
i
,
j
,
k
]
*
self
.
alpha
+
math
.
sin
(
val
)
*
self
.
beta
self
.
out
[
i
,
j
,
half_shape
+
k
]
=
\
self
.
x
[
i
,
j
,
half_shape
+
k
]
*
self
.
alpha
+
math
.
cos
(
val
)
*
self
.
beta
class
TestAddPositionEncodingLoDTensorOp
(
OpTest
):
"""
This class is to test the AddPositionEncodingLoDTensorOp
"""
def
setUp
(
self
):
"""
the prepared section for add position encoding LoDTensor op
"""
self
.
op_type
=
"add_position_encoding"
self
.
dtype
=
np
.
float32
self
.
init_input_output
()
self
.
inputs
=
{
'X'
:
(
self
.
x
,
self
.
lod
),
}
self
.
outputs
=
{
'Out'
:
(
self
.
out
,
self
.
lod
)}
self
.
attrs
=
{
'alpha'
:
self
.
alpha
,
'beta'
:
self
.
beta
}
def
test_check_output
(
self
):
"""
check the correctness of output
"""
self
.
check_output
()
def
test_check_grad
(
self
):
"""
check the correctness of grad
"""
self
.
check_grad
([
'X'
],
'Out'
,
max_relative_error
=
0.005
)
def
init_input_output
(
self
):
"""
init the input and output for test cases
"""
self
.
alpha
=
0.6
self
.
beta
=
0.5
self
.
x
=
np
.
random
.
uniform
(
0.1
,
1
,
[
10
,
4
]).
astype
(
self
.
dtype
)
self
.
lod
=
[[
3
,
7
]]
self
.
out
=
np
.
copy
(
self
.
x
)
batch_size
=
len
(
self
.
lod
[
0
])
enc_size
=
self
.
x
.
shape
[
1
]
start
=
0
half_shape
=
int
(
enc_size
/
2
)
for
i
in
range
(
batch_size
):
max_length
=
self
.
lod
[
0
][
i
]
for
j
in
range
(
max_length
):
for
k
in
range
(
half_shape
):
val
=
j
/
pow
(
10000.0
,
k
/
(
half_shape
-
1
))
if
half_shape
>
1
else
j
/
10000.0
pos
=
start
+
j
self
.
out
[
pos
,
k
]
=
\
self
.
x
[
pos
,
k
]
*
self
.
alpha
+
math
.
sin
(
val
)
*
self
.
beta
self
.
out
[
pos
,
half_shape
+
k
]
=
\
self
.
x
[
pos
,
half_shape
+
k
]
*
self
.
alpha
+
math
.
cos
(
val
)
*
self
.
beta
start
+=
max_length
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_dist_base.py
浏览文件 @
cf1944af
...
...
@@ -22,6 +22,8 @@ import signal
import
subprocess
import
six
import
argparse
import
pickle
import
numpy
as
np
import
paddle.fluid
as
fluid
...
...
@@ -128,10 +130,15 @@ class TestDistRunnerBase(object):
else
:
return
origin_batch
out_losses
=
[]
for
_
in
six
.
moves
.
xrange
(
RUN_STEP
):
loss
,
=
exe
.
run
(
fetch_list
=
[
avg_cost
.
name
],
feed
=
feeder
.
feed
(
get_data
()))
print
(
loss
)
out_losses
.
append
(
loss
[
0
])
if
six
.
PY2
:
print
(
pickle
.
dumps
(
out_losses
))
else
:
sys
.
stdout
.
buffer
.
write
(
pickle
.
dumps
(
out_losses
))
def
runtime_main
(
test_class
):
...
...
@@ -149,7 +156,7 @@ def runtime_main(test_class):
parser
.
add_argument
(
'--use_cuda'
,
action
=
'store_true'
)
parser
.
add_argument
(
'--use_reduce'
,
action
=
'store_true'
)
parser
.
add_argument
(
'--use_reader_alloc'
,
action
=
'store_true'
,
required
=
False
,
default
=
True
)
'--use_reader_alloc'
,
action
=
'store_true'
,
required
=
False
)
parser
.
add_argument
(
'--batch_size'
,
required
=
False
,
type
=
int
,
default
=
2
)
parser
.
add_argument
(
'--batch_merge_repeat'
,
required
=
False
,
type
=
int
,
default
=
1
)
...
...
@@ -188,7 +195,7 @@ class TestDistBase(unittest.TestCase):
self
.
_pservers
=
2
self
.
_ps_endpoints
=
"127.0.0.1:%s,127.0.0.1:%s"
%
(
self
.
_find_free_port
(),
self
.
_find_free_port
())
self
.
_python_interp
=
"python"
self
.
_python_interp
=
sys
.
executable
self
.
_sync_mode
=
True
self
.
_enforce_place
=
None
self
.
_mem_opt
=
False
...
...
@@ -237,21 +244,6 @@ class TestDistBase(unittest.TestCase):
return
ps0_proc
,
ps1_proc
,
ps0_pipe
,
ps1_pipe
def
_wait_ps_ready
(
self
,
pid
):
retry_times
=
50
while
True
:
assert
retry_times
>=
0
,
"wait ps ready failed"
time
.
sleep
(
3
)
try
:
# the listen_and_serv_op would touch a file which contains the listen port
# on the /tmp directory until it was ready to process all the RPC call.
os
.
stat
(
"/tmp/paddle.%d.port"
%
pid
)
return
except
os
.
error
as
e
:
sys
.
stderr
.
write
(
'waiting for pserver: %s, left retry %d
\n
'
%
(
e
,
retry_times
))
retry_times
-=
1
def
_run_local
(
self
,
model
,
envs
,
...
...
@@ -288,23 +280,20 @@ class TestDistBase(unittest.TestCase):
env
=
envs
)
local_out
,
local_err
=
local_proc
.
communicate
()
local_ret
=
cpt
.
to_text
(
local_out
)
if
check_error_log
:
err_log
.
close
()
sys
.
stderr
.
write
(
'local_stdout: %s
\n
'
%
local_ret
)
sys
.
stderr
.
write
(
'local_stdout: %s
\n
'
%
pickle
.
loads
(
local_out
)
)
sys
.
stderr
.
write
(
'local_stderr: %s
\n
'
%
local_err
)
local_losses
=
local_ret
.
split
(
"
\n
"
)
return
local_losses
return
pickle
.
loads
(
local_out
)
def
_run_cluster
(
self
,
model
,
envs
,
check_error_log
):
# Run dist train to compare with local results
ps0
,
ps1
,
ps0_pipe
,
ps1_pipe
=
self
.
start_pserver
(
model
,
check_error_log
,
envs
)
self
.
_wait_ps_ready
(
ps0
.
pid
)
self
.
_wait_ps_ready
(
ps1
.
pid
)
ps0_ep
,
ps1_ep
=
self
.
_ps_endpoints
.
split
(
","
)
tr_cmd
=
"%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --is_dist"
...
...
@@ -339,8 +328,8 @@ class TestDistBase(unittest.TestCase):
env0
.
update
(
envs
)
env1
.
update
(
envs
)
print
(
"tr0_cmd:{}
, env0: {}"
.
format
(
tr0_cmd
,
env0
))
print
(
"tr1_cmd:{}
, env1: {}"
.
format
(
tr1_cmd
,
env1
))
print
(
"tr0_cmd:{}
"
.
format
(
tr0_cmd
))
print
(
"tr1_cmd:{}
"
.
format
(
tr1_cmd
))
tr0_pipe
=
open
(
"/tmp/tr0_err.log"
,
"wb"
)
tr1_pipe
=
open
(
"/tmp/tr1_err.log"
,
"wb"
)
...
...
@@ -356,9 +345,7 @@ class TestDistBase(unittest.TestCase):
env
=
env1
)
tr0_out
,
tr0_err
=
tr0_proc
.
communicate
()
tr0_loss_text
=
cpt
.
to_text
(
tr0_out
)
tr1_out
,
tr1_err
=
tr1_proc
.
communicate
()
tr1_loss_text
=
cpt
.
to_text
(
tr1_out
)
# close trainer file
tr0_pipe
.
close
()
...
...
@@ -373,15 +360,13 @@ class TestDistBase(unittest.TestCase):
ps1
.
terminate
()
# print log
sys
.
stderr
.
write
(
'trainer 0 stdout:
\n
%s
\n
'
%
tr0_loss_text
)
sys
.
stderr
.
write
(
'trainer 0 stderr:
\n
%s
\n
'
%
tr0_err
)
sys
.
stderr
.
write
(
'trainer 1 stdout: %s
\n
'
%
tr1_loss_text
)
sys
.
stderr
.
write
(
'trainer 0 stdout:
%s
\n
'
%
pickle
.
loads
(
tr0_out
)
)
sys
.
stderr
.
write
(
'trainer 0 stderr: %s
\n
'
%
tr0_err
)
sys
.
stderr
.
write
(
'trainer 1 stdout: %s
\n
'
%
pickle
.
loads
(
tr1_out
)
)
sys
.
stderr
.
write
(
'trainer 1 stderr: %s
\n
'
%
tr1_err
)
tr0_losses
=
tr0_loss_text
.
split
(
"
\n
"
)
tr1_losses
=
tr1_loss_text
.
split
(
"
\n
"
)
return
tr0_losses
,
tr1_losses
# return tr0_losses, tr1_losses
return
pickle
.
loads
(
tr0_out
),
pickle
.
loads
(
tr1_out
)
def
check_with_place
(
self
,
model_file
,
...
...
@@ -411,9 +396,9 @@ class TestDistBase(unittest.TestCase):
check_error_log
)
for
step_id
in
range
(
RUN_STEP
):
local_loss
=
eval
(
local_losses
[
step_id
])[
0
]
tr0_loss
=
eval
(
tr0_losses
[
step_id
])[
0
]
tr1_loss
=
eval
(
tr1_losses
[
step_id
])[
0
]
dist_loss
=
(
tr0_loss
+
tr1_loss
)
/
2
print
(
str
(
local_loss
)
+
":"
+
str
(
dist_loss
)
)
self
.
assertAlmostEqual
(
local_loss
,
dist_loss
,
delta
=
delta
)
local_loss
=
local_losses
[
step_id
]
tr0_loss
=
tr0_losses
[
step_id
]
tr1_loss
=
tr1_losses
[
step_id
]
dist_loss
=
(
np
.
array
([
tr0_loss
])
+
np
.
array
([
tr1_loss
])
)
/
2
print
(
"======="
,
local_loss
,
":"
,
dist_loss
[
0
],
"======="
)
self
.
assertAlmostEqual
(
local_loss
,
dist_loss
[
0
]
,
delta
=
delta
)
python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
浏览文件 @
cf1944af
...
...
@@ -23,16 +23,17 @@ class TestDistSeResneXt2x2(TestDistBase):
self
.
_use_reader_alloc
=
False
def
test_dist_train
(
self
):
self
.
check_with_place
(
"dist_se_resnext.py"
,
delta
=
1
00
)
self
.
check_with_place
(
"dist_se_resnext.py"
,
delta
=
1
e-7
)
class
TestDistseResnXt2x2WithMemopt
(
TestDistBase
):
def
_setup_config
(
self
):
self
.
_sync_mode
=
True
self
.
_mem_opt
=
True
self
.
_use_reader_alloc
=
False
def
test_dist_train
(
self
):
self
.
check_with_place
(
"dist_se_resnext.py"
,
delta
=
1
00
)
self
.
check_with_place
(
"dist_se_resnext.py"
,
delta
=
1
e-7
)
class
TestDistSeResneXt2x2Async
(
TestDistBase
):
...
...
python/paddle/fluid/tests/unittests/test_dist_transpiler.py
浏览文件 @
cf1944af
...
...
@@ -283,6 +283,25 @@ class TestDecayedAdagrad(TranspilerTest):
trainer
,
_
=
self
.
get_trainer
()
class
TestFtrl
(
TranspilerTest
):
def
net_conf
(
self
):
x
=
fluid
.
layers
.
data
(
name
=
'x'
,
shape
=
[
1000
],
dtype
=
'float32'
)
y_predict
=
fluid
.
layers
.
fc
(
input
=
x
,
size
=
1000
,
act
=
None
,
param_attr
=
fluid
.
ParamAttr
(
name
=
'fc_w'
),
bias_attr
=
fluid
.
ParamAttr
(
name
=
'fc_b'
))
y
=
fluid
.
layers
.
data
(
name
=
'y'
,
shape
=
[
1
],
dtype
=
'float32'
)
cost
=
fluid
.
layers
.
square_error_cost
(
input
=
y_predict
,
label
=
y
)
avg_cost
=
fluid
.
layers
.
mean
(
cost
)
opt
=
fluid
.
optimizer
.
Ftrl
(
learning_rate
=
0.1
)
opt
.
minimize
(
avg_cost
)
def
transpiler_test_impl
(
self
):
pserver
,
startup
=
self
.
get_pserver
(
self
.
pserver1_ep
)
trainer
,
_
=
self
.
get_trainer
()
class
TestLRDecayConditional
(
TranspilerTest
):
def
net_conf
(
self
):
x
=
fluid
.
layers
.
data
(
name
=
'x'
,
shape
=
[
1000
],
dtype
=
'float32'
)
...
...
@@ -405,18 +424,43 @@ class TestL2DecayWithPiecewise(TranspilerTest):
[
"sum"
,
"scale"
,
"scale"
,
"elementwise_add"
,
"momentum"
])
class
TestEmptyPserverOptimizeBlocks
(
TranspilerTest
):
def
net_conf
(
self
):
x
=
fluid
.
layers
.
data
(
name
=
'x'
,
shape
=
[
1000
],
dtype
=
'float32'
)
# only one parameter
y_predict
=
fluid
.
layers
.
fc
(
input
=
x
,
size
=
1000
,
act
=
None
,
param_attr
=
fluid
.
ParamAttr
(
name
=
'fc_w'
),
bias_attr
=
False
)
y
=
fluid
.
layers
.
data
(
name
=
'y'
,
shape
=
[
1
],
dtype
=
'float32'
)
cost
=
fluid
.
layers
.
square_error_cost
(
input
=
y_predict
,
label
=
y
)
avg_cost
=
fluid
.
layers
.
mean
(
cost
)
sgd_optimizer
=
fluid
.
optimizer
.
SGD
(
learning_rate
=
1.0
)
sgd_optimizer
.
minimize
(
avg_cost
)
def
transpiler_test_impl
(
self
):
config
=
fluid
.
DistributeTranspilerConfig
()
config
.
slice_var_up
=
False
pserver
,
startup
=
self
.
get_pserver
(
ep
=
self
.
pserver2_ep
,
config
=
config
)
self
.
assertEqual
(
len
(
pserver
.
blocks
),
2
)
self
.
assertEqual
(
len
(
pserver
.
blocks
[
1
].
ops
),
0
)
class
TestDistLookupTableBase
(
TranspilerTest
):
def
network_with_table
(
self
,
is_sparse
,
is_distributed
):
self
.
table_size
=
1000
self
.
emb_size
=
64
self
.
lookup_table_name
=
'shared_w'
def
emb_pool
(
ids
):
def
emb_pool
(
ids
,
table_name
,
is_distributed
):
emb
=
fluid
.
layers
.
embedding
(
input
=
ids
,
size
=
[
self
.
table_size
,
self
.
emb_size
],
dtype
=
'float32'
,
param_attr
=
self
.
lookup_table_name
,
# share parameter
param_attr
=
table_name
,
is_sparse
=
is_sparse
,
is_distributed
=
is_distributed
)
pool
=
fluid
.
layers
.
sequence_pool
(
input
=
emb
,
pool_type
=
'average'
)
...
...
@@ -426,9 +470,13 @@ class TestDistLookupTableBase(TranspilerTest):
name
=
'title_ids'
,
shape
=
[
1
],
dtype
=
'int64'
,
lod_level
=
1
)
brand_ids
=
fluid
.
layers
.
data
(
name
=
'brand_ids'
,
shape
=
[
1
],
dtype
=
'int64'
,
lod_level
=
1
)
title_emb
=
emb_pool
(
title_ids
)
brand_emb
=
emb_pool
(
brand_ids
)
fc0
=
fluid
.
layers
.
concat
(
input
=
[
title_emb
,
brand_emb
],
axis
=
1
)
profile_ids
=
fluid
.
layers
.
data
(
name
=
'brand_ids'
,
shape
=
[
1
],
dtype
=
'int64'
,
lod_level
=
1
)
title_emb
=
emb_pool
(
title_ids
,
self
.
lookup_table_name
,
is_distributed
)
brand_emb
=
emb_pool
(
brand_ids
,
self
.
lookup_table_name
,
is_distributed
)
profile_emb
=
emb_pool
(
profile_ids
,
"profile_emb"
,
False
)
fc0
=
fluid
.
layers
.
concat
(
input
=
[
title_emb
,
brand_emb
,
profile_emb
],
axis
=
1
)
predict
=
fluid
.
layers
.
fc
(
input
=
fc0
,
size
=
2
,
act
=
None
,
...
...
@@ -449,7 +497,7 @@ class TestLocalLookupTable(TestDistLookupTableBase):
def
transpiler_test_impl
(
self
):
pserver1
,
startup1
=
self
.
get_pserver
(
self
.
pserver1_ep
)
self
.
assertEqual
(
len
(
pserver1
.
blocks
),
3
)
self
.
assertEqual
(
len
(
pserver1
.
blocks
),
4
)
# 0 listen_and_serv
# 1 optimize for fc_w or fc_b adam
self
.
assertEqual
([
op
.
type
for
op
in
pserver1
.
blocks
[
1
].
ops
],
...
...
@@ -459,16 +507,23 @@ class TestLocalLookupTable(TestDistLookupTableBase):
self
.
assertEqual
([
op
.
type
for
op
in
pserver1
.
blocks
[
2
].
ops
],
[
"sum"
,
"scale"
,
"adam"
,
"scale"
,
"scale"
])
# 3 optimize for table 2 adam
# NOTE: if param is not selected rows, the grad will scaled to grad / trainer_num
self
.
assertEqual
([
op
.
type
for
op
in
pserver1
.
blocks
[
3
].
ops
],
[
"sum"
,
"scale"
,
"adam"
,
"scale"
,
"scale"
])
trainer
,
_
=
self
.
get_trainer
()
self
.
assertEqual
(
len
(
trainer
.
blocks
),
1
)
ops
=
[
'lookup_table'
,
'sequence_pool'
,
'lookup_table'
,
'sequence_pool'
,
'concat'
,
'mul'
,
'elementwise_add'
,
'cross_entropy'
,
'mean'
,
'fill_constant'
,
'mean_grad'
,
'cross_entropy_grad'
,
'elementwise_add_grad'
,
'send'
,
'mul_grad'
,
'send'
,
'concat_grad'
,
'sequence_pool_grad'
,
'lookup_table_grad'
,
'sequence_pool_grad'
,
'lookup_table_grad'
,
'sum'
,
'split_selected_rows'
,
'send'
,
'send_barrier'
,
'recv'
,
'recv'
,
'recv'
,
'fetch_barrier'
,
'concat'
'lookup_table'
,
'sequence_pool'
,
'concat'
,
'mul'
,
'elementwise_add'
,
'cross_entropy'
,
'mean'
,
'fill_constant'
,
'mean_grad'
,
'cross_entropy_grad'
,
'elementwise_add_grad'
,
'send'
,
'mul_grad'
,
'send'
,
'concat_grad'
,
'sequence_pool_grad'
,
'lookup_table_grad'
,
'split_selected_rows'
,
'send'
,
'sequence_pool_grad'
,
'lookup_table_grad'
,
'sequence_pool_grad'
,
'lookup_table_grad'
,
'sum'
,
'split_selected_rows'
,
'send'
,
'send_barrier'
,
'recv'
,
'recv'
,
'recv'
,
'recv'
,
'fetch_barrier'
,
'concat'
,
'concat'
]
self
.
assertEqual
([
op
.
type
for
op
in
trainer
.
blocks
[
0
].
ops
],
ops
)
...
...
@@ -480,39 +535,45 @@ class TestDistLookupTable(TestDistLookupTableBase):
def
transpiler_test_impl
(
self
):
pserver1
,
startup1
=
self
.
get_pserver
(
self
.
pserver1_ep
)
self
.
assertEqual
(
len
(
pserver1
.
blocks
),
5
)
self
.
assertEqual
(
len
(
pserver1
.
blocks
),
6
)
# 0 listen_and_serv
# 1 optimize for fc_w or fc_b adam
self
.
assertEqual
([
op
.
type
for
op
in
pserver1
.
blocks
[
1
].
ops
],
[
"sum"
,
"scale"
,
"adam"
,
"scale"
,
"scale"
])
#
2 optimize for table sgd
#
4 prefetch -> lookup_sparse_table for data0
self
.
assertEqual
([
op
.
type
for
op
in
pserver1
.
blocks
[
2
].
ops
],
[
"sum"
,
"scale"
,
"adam"
,
"scale"
,
"scale"
])
# 2 optimize for table sgd
self
.
assertEqual
([
op
.
type
for
op
in
pserver1
.
blocks
[
3
].
ops
],
[
"sum"
,
"sgd"
])
# 3 prefetch -> lookup_sparse_table for data0
self
.
assertEqual
([
op
.
type
for
op
in
pserver1
.
blocks
[
3
].
ops
],
self
.
assertEqual
([
op
.
type
for
op
in
pserver1
.
blocks
[
4
].
ops
],
[
"lookup_sparse_table"
])
#
4
save table
self
.
assertEqual
([
op
.
type
for
op
in
pserver1
.
blocks
[
4
].
ops
],
[
"save"
])
#
5
save table
self
.
assertEqual
([
op
.
type
for
op
in
pserver1
.
blocks
[
5
].
ops
],
[
"save"
])
trainer
,
trainer_startup
=
self
.
get_trainer
()
self
.
assertEqual
(
len
(
trainer
.
blocks
),
1
)
ops
=
[
'split_ids'
,
'prefetch'
,
'merge_ids'
,
'sequence_pool'
,
'sequence_pool'
,
'concat'
,
'mul'
,
'elementwise_add'
,
'cross_entropy'
,
'mean'
,
'fill_constant'
,
'mean_grad'
,
'cross_entropy_grad'
,
'elementwise_add_grad'
,
'send'
,
'mul_grad'
,
'send'
,
'concat_grad'
,
'sequence_pool_grad'
,
'lookup_table_grad'
,
'sequence_pool_grad'
,
'lookup_table_grad'
,
'sum'
,
'split_ids'
,
'send'
,
'send_barrier'
,
'recv'
,
'recv'
,
'fetch_barrier'
'sequence_pool'
,
'lookup_table'
,
'sequence_pool'
,
'concat'
,
'mul'
,
'elementwise_add'
,
'cross_entropy'
,
'mean'
,
'fill_constant'
,
'mean_grad'
,
'cross_entropy_grad'
,
'elementwise_add_grad'
,
'send'
,
'mul_grad'
,
'send'
,
'concat_grad'
,
'sequence_pool_grad'
,
'lookup_table_grad'
,
'split_selected_rows'
,
'send'
,
'sequence_pool_grad'
,
'lookup_table_grad'
,
'sequence_pool_grad'
,
'lookup_table_grad'
,
'sum'
,
'split_ids'
,
'send'
,
'send_barrier'
,
'recv'
,
'recv'
,
'recv'
,
'fetch_barrier'
,
'concat'
]
self
.
assertEqual
([
op
.
type
for
op
in
trainer
.
blocks
[
0
].
ops
],
ops
)
startup_ops
=
[
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'uniform_random'
,
'recv'
,
'recv'
,
'fetch_barrier'
,
'fake_init'
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'uniform_random'
,
'uniform_random'
,
'recv'
,
'recv'
,
'recv'
,
'fetch_barrier'
,
'concat'
,
'fake_init'
]
self
.
assertEqual
([
op
.
type
for
op
in
trainer_startup
.
blocks
[
0
].
ops
],
startup_ops
)
...
...
@@ -526,7 +587,7 @@ class TestAsyncLocalLookupTable(TestDistLookupTableBase):
config
=
fluid
.
DistributeTranspilerConfig
()
pserver1
,
startup1
=
self
.
get_pserver
(
self
.
pserver1_ep
,
config
,
False
)
self
.
assertEqual
(
len
(
pserver1
.
blocks
),
3
)
self
.
assertEqual
(
len
(
pserver1
.
blocks
),
4
)
# 0 listen_and_serv
# 1 optimize for fc_w or fc_b adam
self
.
assertEqual
([
op
.
type
for
op
in
pserver1
.
blocks
[
1
].
ops
],
...
...
@@ -535,17 +596,23 @@ class TestAsyncLocalLookupTable(TestDistLookupTableBase):
# NOTE: if param is not selected rows, the grad will scaled to grad / trainer_num
self
.
assertEqual
([
op
.
type
for
op
in
pserver1
.
blocks
[
2
].
ops
],
[
"adam"
,
"scale"
,
"scale"
])
# 3 optimize for table adam
# NOTE: if param is not selected rows, the grad will scaled to grad / trainer_num
self
.
assertEqual
([
op
.
type
for
op
in
pserver1
.
blocks
[
3
].
ops
],
[
"adam"
,
"scale"
,
"scale"
])
trainer
,
_
=
self
.
get_trainer
(
config
)
self
.
assertEqual
(
len
(
trainer
.
blocks
),
1
)
ops
=
[
'lookup_table'
,
'sequence_pool'
,
'lookup_table'
,
'sequence_pool'
,
'concat'
,
'mul'
,
'elementwise_add'
,
'cross_entropy'
,
'mean'
,
'fill_constant'
,
'mean_grad'
,
'cross_entropy_grad'
,
'elementwise_add_grad'
,
'send'
,
'mul_grad'
,
'send'
,
'concat_grad'
,
'sequence_pool_grad'
,
'lookup_table_grad'
,
'sequence_pool_grad'
,
'lookup_table_grad'
,
'sum'
,
'split_selected_rows'
,
'send'
,
'recv'
,
'recv'
,
'recv'
,
'concat'
'lookup_table'
,
'sequence_pool'
,
'concat'
,
'mul'
,
'elementwise_add'
,
'cross_entropy'
,
'mean'
,
'fill_constant'
,
'mean_grad'
,
'cross_entropy_grad'
,
'elementwise_add_grad'
,
'send'
,
'mul_grad'
,
'send'
,
'concat_grad'
,
'sequence_pool_grad'
,
'lookup_table_grad'
,
'split_selected_rows'
,
'send'
,
'sequence_pool_grad'
,
'lookup_table_grad'
,
'sequence_pool_grad'
,
'lookup_table_grad'
,
'sum'
,
'split_selected_rows'
,
'send'
,
'recv'
,
'recv'
,
'recv'
,
'recv'
,
'concat'
,
'concat'
]
self
.
assertEqual
([
op
.
type
for
op
in
trainer
.
blocks
[
0
].
ops
],
ops
)
...
...
@@ -559,29 +626,34 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase):
pserver1
,
startup1
=
self
.
get_pserver
(
self
.
pserver1_ep
,
config
,
False
)
self
.
assertEqual
(
len
(
pserver1
.
blocks
),
5
)
self
.
assertEqual
(
len
(
pserver1
.
blocks
),
6
)
# 0 listen_and_serv
# 1 optimize for fc_w or fc_b adam
self
.
assertEqual
([
op
.
type
for
op
in
pserver1
.
blocks
[
1
].
ops
],
[
"adam"
,
"scale"
,
"scale"
])
# 2 optimize for table sgd
self
.
assertEqual
([
op
.
type
for
op
in
pserver1
.
blocks
[
2
].
ops
],
[
"sgd"
])
# 3 prefetch -> lookup_sparse_table for data0
self
.
assertEqual
([
op
.
type
for
op
in
pserver1
.
blocks
[
3
].
ops
],
# 2 optimize for table adam
self
.
assertEqual
([
op
.
type
for
op
in
pserver1
.
blocks
[
2
].
ops
],
[
"adam"
,
"scale"
,
"scale"
])
# 3 optimize for table sgd
self
.
assertEqual
([
op
.
type
for
op
in
pserver1
.
blocks
[
3
].
ops
],
[
"sgd"
])
# 4 prefetch -> lookup_sparse_table for data0
self
.
assertEqual
([
op
.
type
for
op
in
pserver1
.
blocks
[
4
].
ops
],
[
"lookup_sparse_table"
])
#
4
save table
self
.
assertEqual
([
op
.
type
for
op
in
pserver1
.
blocks
[
4
].
ops
],
[
"save"
])
#
5
save table
self
.
assertEqual
([
op
.
type
for
op
in
pserver1
.
blocks
[
5
].
ops
],
[
"save"
])
trainer
,
_
=
self
.
get_trainer
(
config
)
self
.
assertEqual
(
len
(
trainer
.
blocks
),
1
)
ops
=
[
'split_ids'
,
'prefetch'
,
'merge_ids'
,
'sequence_pool'
,
'sequence_pool'
,
'concat'
,
'mul'
,
'elementwise_add'
,
'cross_entropy'
,
'mean'
,
'fill_constant'
,
'mean_grad'
,
'cross_entropy_grad'
,
'elementwise_add_grad'
,
'send'
,
'mul_grad'
,
'send'
,
'concat_grad'
,
'sequence_pool_grad'
,
'lookup_table_grad'
,
'sequence_pool_grad'
,
'lookup_table_grad'
,
'sum'
,
'split_ids'
,
'send'
,
'recv'
,
'recv'
'sequence_pool'
,
'lookup_table'
,
'sequence_pool'
,
'concat'
,
'mul'
,
'elementwise_add'
,
'cross_entropy'
,
'mean'
,
'fill_constant'
,
'mean_grad'
,
'cross_entropy_grad'
,
'elementwise_add_grad'
,
'send'
,
'mul_grad'
,
'send'
,
'concat_grad'
,
'sequence_pool_grad'
,
'lookup_table_grad'
,
'split_selected_rows'
,
'send'
,
'sequence_pool_grad'
,
'lookup_table_grad'
,
'sequence_pool_grad'
,
'lookup_table_grad'
,
'sum'
,
'split_ids'
,
'send'
,
'recv'
,
'recv'
,
'recv'
,
'concat'
]
self
.
assertEqual
([
op
.
type
for
op
in
trainer
.
blocks
[
0
].
ops
],
ops
)
...
...
python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
浏览文件 @
cf1944af
...
...
@@ -55,6 +55,46 @@ def run_pserver(use_cuda, sync_mode, ip, port, trainers, trainer_id):
exe
.
run
(
pserver_prog
)
def
run_pserver_with_empty_block
(
use_cuda
,
sync_mode
,
ip
,
port
,
trainers
,
trainer_id
):
x
=
fluid
.
layers
.
data
(
name
=
'x'
,
shape
=
[
1
],
dtype
=
'float32'
)
y_predict
=
fluid
.
layers
.
fc
(
input
=
x
,
size
=
1
,
act
=
None
,
bias_attr
=
False
)
y
=
fluid
.
layers
.
data
(
name
=
'y'
,
shape
=
[
1
],
dtype
=
'float32'
)
# loss function
cost
=
fluid
.
layers
.
square_error_cost
(
input
=
y_predict
,
label
=
y
)
avg_cost
=
fluid
.
layers
.
mean
(
cost
)
# optimizer
sgd_optimizer
=
fluid
.
optimizer
.
SGD
(
learning_rate
=
0.001
)
sgd_optimizer
.
minimize
(
avg_cost
)
place
=
fluid
.
CUDAPlace
(
0
)
if
use_cuda
else
fluid
.
CPUPlace
()
exe
=
fluid
.
Executor
(
place
)
ps1
=
ip
+
":"
+
str
(
int
(
port
)
+
1
)
ps2
=
ip
+
":"
+
port
pserver_endpoints
=
ps1
+
","
+
ps2
config
=
fluid
.
DistributeTranspilerConfig
()
config
.
slice_var_up
=
False
t
=
fluid
.
DistributeTranspiler
(
config
=
config
)
t
.
transpile
(
trainer_id
,
pservers
=
pserver_endpoints
,
trainers
=
trainers
,
sync_mode
=
sync_mode
)
pserver_prog
=
t
.
get_pserver_program
(
ps2
)
# pserver2 have no parameter
assert
(
len
(
pserver_prog
.
blocks
)
==
2
)
assert
(
len
(
pserver_prog
.
blocks
[
1
].
ops
)
==
0
)
pserver_startup
=
t
.
get_startup_program
(
ps2
,
pserver_prog
)
exe
.
run
(
pserver_startup
)
exe
.
run
(
pserver_prog
)
class
TestListenAndServOp
(
OpTest
):
def
setUp
(
self
):
self
.
ps_timeout
=
5
...
...
@@ -63,9 +103,9 @@ class TestListenAndServOp(OpTest):
self
.
trainers
=
1
self
.
trainer_id
=
0
def
_start_pserver
(
self
,
use_cuda
,
sync_mode
):
def
_start_pserver
(
self
,
use_cuda
,
sync_mode
,
pserver_func
):
p
=
Process
(
target
=
run_pserver
,
target
=
pserver_func
,
args
=
(
use_cuda
,
sync_mode
,
self
.
ip
,
self
.
port
,
self
.
trainers
,
self
.
trainer_id
))
p
.
daemon
=
True
...
...
@@ -92,7 +132,24 @@ class TestListenAndServOp(OpTest):
def
test_handle_signal_in_serv_op
(
self
):
# run pserver on CPU in sync mode
p1
=
self
.
_start_pserver
(
False
,
True
)
p1
=
self
.
_start_pserver
(
False
,
True
,
run_pserver
)
self
.
_wait_ps_ready
(
p1
.
pid
)
# raise SIGTERM to pserver
os
.
kill
(
p1
.
pid
,
signal
.
SIGINT
)
p1
.
join
()
# run pserver on CPU in async mode
p2
=
self
.
_start_pserver
(
False
,
False
,
run_pserver
)
self
.
_wait_ps_ready
(
p2
.
pid
)
# raise SIGTERM to pserver
os
.
kill
(
p2
.
pid
,
signal
.
SIGTERM
)
p2
.
join
()
def
test_list_and_serv_run_empty_optimize_block
(
self
):
# run pserver on CPU in sync mode
p1
=
self
.
_start_pserver
(
False
,
True
,
run_pserver_with_empty_block
)
self
.
_wait_ps_ready
(
p1
.
pid
)
# raise SIGTERM to pserver
...
...
@@ -100,7 +157,7 @@ class TestListenAndServOp(OpTest):
p1
.
join
()
# run pserver on CPU in async mode
p2
=
self
.
_start_pserver
(
False
,
False
)
p2
=
self
.
_start_pserver
(
False
,
False
,
run_pserver_with_empty_block
)
self
.
_wait_ps_ready
(
p2
.
pid
)
# raise SIGTERM to pserver
...
...
python/paddle/fluid/tests/unittests/test_seq_pool.py
浏览文件 @
cf1944af
...
...
@@ -184,6 +184,20 @@ class TestSeqMaxPool2D(TestSeqAvgPool2D):
out
[
i
]
=
np
.
reshape
(
np
.
amax
(
sub_x
,
axis
=
0
),
(
3
,
11
))
class
TestSeqMaxPool2DInference
(
TestSeqMaxPool2D
):
def
compute
(
self
,
x
,
offset
,
out
):
self
.
attrs
=
{
'pooltype'
:
"MAX"
,
'is_test'
:
True
}
for
i
in
range
(
len
(
offset
[
0
])
-
1
):
sub_x
=
np
.
reshape
(
x
[
offset
[
0
][
i
]:
offset
[
0
][
i
+
1
],
:],
(
-
1
,
3
*
11
))
out
[
i
]
=
np
.
reshape
(
np
.
amax
(
sub_x
,
axis
=
0
),
(
3
,
11
))
def
test_check_grad
(
self
):
"""Grad computation does not apply to Sequence MAX
Pool executed when is_test is true """
return
class
TestSeqLastPool2D
(
TestSeqAvgPool2D
):
def
compute
(
self
,
x
,
offset
,
out
):
self
.
attrs
=
{
'pooltype'
:
"LAST"
}
...
...
python/paddle/fluid/transpiler/distribute_transpiler.py
浏览文件 @
cf1944af
...
...
@@ -35,6 +35,7 @@ import sys
import
numpy
as
np
import
collections
import
six
import
logging
from
.ps_dispatcher
import
RoundRobin
,
HashName
,
PSDispatcher
from
..
import
core
,
framework
...
...
@@ -767,6 +768,15 @@ in a single call.")
prefetch_var_name_to_block_id
.
extend
(
lookup_table_var_name_to_block_id
)
if
len
(
optimize_blocks
)
==
0
:
logging
.
warn
(
"pserver ["
+
str
(
endpoint
)
+
"] has no optimize block!!"
)
pre_block_idx
=
pserver_program
.
num_blocks
-
1
empty_block
=
pserver_program
.
_create_block
(
pre_block_idx
)
optimize_blocks
.
append
(
empty_block
)
# In some case, some parameter server will have no parameter to optimize
# So we give an empty optimize block to parameter server.
attrs
=
{
"optimize_blocks"
:
optimize_blocks
,
"endpoint"
:
endpoint
,
...
...
@@ -1065,7 +1075,12 @@ to transpile() call.")
continue_search_lookup_table_op
=
False
all_ops
=
program
.
global_block
().
ops
for
op
in
all_ops
:
if
op
.
type
==
LOOKUP_TABLE_TYPE
:
if
op
.
type
==
LOOKUP_TABLE_TYPE
and
self
.
table_name
==
op
.
input
(
"W"
)[
0
]:
if
not
op
.
attr
(
'is_distributed'
):
raise
RuntimeError
(
"lookup_table_op that lookup an distributed embedding table"
"should set is_distributed to true"
)
continue_search_lookup_table_op
=
True
lookup_table_op_index
=
lookup_table_op_index
if
lookup_table_op_index
!=
-
1
else
list
(
...
...
@@ -1275,7 +1290,6 @@ to transpile() call.")
}
outputs
=
{
"ParamOut"
:
[
param_var
]}
# only support sgd now
import
logging
logging
.
warn
(
"distribute lookup table only support sgd optimizer, change it's optimizer to sgd instead of "
+
table_opt_op
.
type
)
...
...
@@ -1442,6 +1456,9 @@ to transpile() call.")
elif
op_type
==
"decayed_adagrad"
:
if
varkey
==
"Moment"
:
return
param_shape
elif
op_type
==
"ftrl"
:
if
varkey
in
[
"SquaredAccumulator"
,
"LinearAccumulator"
]:
return
param_shape
elif
op_type
==
"sgd"
:
pass
else
:
...
...
python/paddle/fluid/transpiler/inference_transpiler.py
浏览文件 @
cf1944af
...
...
@@ -61,6 +61,9 @@ class InferenceTranspiler(object):
raise
TypeError
(
"scope should be as Scope type or None"
)
use_mkldnn
=
bool
(
os
.
getenv
(
"FLAGS_use_mkldnn"
,
False
))
if
use_mkldnn
:
self
.
_depthwise_conv_mkldnn
(
program
)
self
.
_fuse_batch_norm
(
program
,
place
,
scope
)
if
use_mkldnn
:
self
.
_fuse_conv_bias_mkldnn
(
program
)
...
...
@@ -70,6 +73,31 @@ class InferenceTranspiler(object):
program
)
# ResNet residual block merging
self
.
_fuse_bn_relu_mkldnn
(
program
)
def
_depthwise_conv_mkldnn
(
self
,
program
):
'''
Transpile the program by replacing depthwise_conv2d to conv2d for MKLDNN program.
The result is:
- before:
- any_other_op->depthwise_conv->any_other_op
- after:
- any_other_op->conv->any_other_op
:param program: program to transpile
:type program: Program
'''
self
.
block
=
program
.
block
(
0
)
i
=
0
while
i
<
len
(
self
.
block
.
ops
):
current_op
=
self
.
block
.
ops
[
i
]
if
current_op
.
type
==
'depthwise_conv2d'
:
current_op
.
desc
.
set_type
(
"conv2d"
)
i
=
i
+
1
# TODO(luotao): use clone() method to flush the program.desc in force,
# since some large program.desc will not be flushed immediately.
# And a better solution will be considered later.
program
=
program
.
clone
()
def
_fuse_conv_eltwise_mkldnn
(
self
,
program
):
'''
Transpile the program fusing elementwise_add into conv for MKLDNN
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录