Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
d3e140a5
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
d3e140a5
编写于
11月 28, 2018
作者:
N
nhzlx
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'develop' of
https://github.com/paddlepaddle/paddle
into add_params_sync_pass
test=develop
上级
d666c8eb
dfd4a111
变更
114
展开全部
隐藏空白更改
内联
并排
Showing
114 changed file
with
509 addition
and
526 deletion
+509
-526
paddle/fluid/framework/data_device_transform.cc
paddle/fluid/framework/data_device_transform.cc
+2
-2
paddle/fluid/framework/data_device_transform_test.cu
paddle/fluid/framework/data_device_transform_test.cu
+3
-3
paddle/fluid/framework/details/broadcast_op_handle.cc
paddle/fluid/framework/details/broadcast_op_handle.cc
+1
-1
paddle/fluid/framework/details/modify_op_lock_and_record_event_pass.cc
...framework/details/modify_op_lock_and_record_event_pass.cc
+2
-2
paddle/fluid/framework/details/multi_devices_graph_pass.cc
paddle/fluid/framework/details/multi_devices_graph_pass.cc
+6
-6
paddle/fluid/framework/details/reference_count_pass.cc
paddle/fluid/framework/details/reference_count_pass.cc
+2
-2
paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
+1
-1
paddle/fluid/framework/details/sequential_execution_pass.cc
paddle/fluid/framework/details/sequential_execution_pass.cc
+2
-2
paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
...le/fluid/framework/details/threaded_ssa_graph_executor.cc
+4
-4
paddle/fluid/framework/executor.cc
paddle/fluid/framework/executor.cc
+11
-11
paddle/fluid/framework/feed_fetch_method.cc
paddle/fluid/framework/feed_fetch_method.cc
+3
-3
paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
+14
-14
paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc
paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc
+2
-2
paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
+3
-3
paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc
paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc
+2
-2
paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.cc
paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.cc
+1
-1
paddle/fluid/framework/ir/fc_fuse_pass.cc
paddle/fluid/framework/ir/fc_fuse_pass.cc
+1
-1
paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc
paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc
+14
-14
paddle/fluid/framework/ir/graph.cc
paddle/fluid/framework/ir/graph.cc
+2
-2
paddle/fluid/framework/ir/graph.h
paddle/fluid/framework/ir/graph.h
+1
-1
paddle/fluid/framework/ir/graph_helper.cc
paddle/fluid/framework/ir/graph_helper.cc
+5
-6
paddle/fluid/framework/ir/graph_pattern_detector.cc
paddle/fluid/framework/ir/graph_pattern_detector.cc
+10
-11
paddle/fluid/framework/ir/graph_viz_pass.cc
paddle/fluid/framework/ir/graph_viz_pass.cc
+1
-1
paddle/fluid/framework/ir/mkldnn_placement_pass.cc
paddle/fluid/framework/ir/mkldnn_placement_pass.cc
+1
-1
paddle/fluid/framework/ir/multi_batch_merge_pass.cc
paddle/fluid/framework/ir/multi_batch_merge_pass.cc
+4
-4
paddle/fluid/framework/ir/pass.h
paddle/fluid/framework/ir/pass.h
+1
-1
paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc
paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc
+1
-1
paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc
paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc
+1
-1
paddle/fluid/framework/lod_rank_table.cc
paddle/fluid/framework/lod_rank_table.cc
+1
-1
paddle/fluid/framework/mixed_vector_test.cc
paddle/fluid/framework/mixed_vector_test.cc
+1
-1
paddle/fluid/framework/op_desc.cc
paddle/fluid/framework/op_desc.cc
+18
-18
paddle/fluid/framework/op_registry.cc
paddle/fluid/framework/op_registry.cc
+3
-3
paddle/fluid/framework/operator.cc
paddle/fluid/framework/operator.cc
+7
-8
paddle/fluid/framework/parallel_executor.cc
paddle/fluid/framework/parallel_executor.cc
+1
-1
paddle/fluid/framework/scope.cc
paddle/fluid/framework/scope.cc
+1
-1
paddle/fluid/framework/selected_rows.cc
paddle/fluid/framework/selected_rows.cc
+1
-1
paddle/fluid/framework/tensor_util.cc
paddle/fluid/framework/tensor_util.cc
+12
-12
paddle/fluid/framework/threadpool.cc
paddle/fluid/framework/threadpool.cc
+1
-1
paddle/fluid/framework/var_desc.cc
paddle/fluid/framework/var_desc.cc
+14
-14
paddle/fluid/inference/api/analysis_predictor.cc
paddle/fluid/inference/api/analysis_predictor.cc
+7
-7
paddle/fluid/inference/api/api_impl.cc
paddle/fluid/inference/api/api_impl.cc
+1
-1
paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc
paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc
+4
-4
paddle/fluid/inference/api/demo_ci/utils.h
paddle/fluid/inference/api/demo_ci/utils.h
+5
-5
paddle/fluid/inference/api/details/reset_tensor_array.cc
paddle/fluid/inference/api/details/reset_tensor_array.cc
+2
-2
paddle/fluid/inference/io.cc
paddle/fluid/inference/io.cc
+2
-2
paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
+1
-1
paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
+3
-3
paddle/fluid/memory/allocation/allocator_facade.cc
paddle/fluid/memory/allocation/allocator_facade.cc
+4
-4
paddle/fluid/memory/allocation/legacy_allocator.cc
paddle/fluid/memory/allocation/legacy_allocator.cc
+9
-9
paddle/fluid/memory/detail/buddy_allocator.cc
paddle/fluid/memory/detail/buddy_allocator.cc
+28
-28
paddle/fluid/memory/detail/meta_cache.cc
paddle/fluid/memory/detail/meta_cache.cc
+1
-1
paddle/fluid/operators/activation_op.h
paddle/fluid/operators/activation_op.h
+1
-1
paddle/fluid/operators/array_operator.h
paddle/fluid/operators/array_operator.h
+1
-1
paddle/fluid/operators/array_to_lod_tensor_op.cc
paddle/fluid/operators/array_to_lod_tensor_op.cc
+2
-2
paddle/fluid/operators/batch_norm_op.cu.cc
paddle/fluid/operators/batch_norm_op.cu.cc
+1
-1
paddle/fluid/operators/beam_search_op.cc
paddle/fluid/operators/beam_search_op.cc
+6
-6
paddle/fluid/operators/concat_op.cc
paddle/fluid/operators/concat_op.cc
+1
-1
paddle/fluid/operators/controlflow/feed_op.cc
paddle/fluid/operators/controlflow/feed_op.cc
+2
-2
paddle/fluid/operators/controlflow/fetch_op.cc
paddle/fluid/operators/controlflow/fetch_op.cc
+1
-1
paddle/fluid/operators/controlflow/parallel_do_op.cc
paddle/fluid/operators/controlflow/parallel_do_op.cc
+5
-5
paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc
...fluid/operators/controlflow/tensor_array_read_write_op.cc
+7
-7
paddle/fluid/operators/controlflow/while_op.cc
paddle/fluid/operators/controlflow/while_op.cc
+9
-9
paddle/fluid/operators/conv_cudnn_op.cu.cc
paddle/fluid/operators/conv_cudnn_op.cu.cc
+2
-2
paddle/fluid/operators/distributed/brpc_server.cc
paddle/fluid/operators/distributed/brpc_server.cc
+2
-2
paddle/fluid/operators/distributed/grpc_client.cc
paddle/fluid/operators/distributed/grpc_client.cc
+7
-7
paddle/fluid/operators/distributed/grpc_server.cc
paddle/fluid/operators/distributed/grpc_server.cc
+22
-23
paddle/fluid/operators/distributed/request_handler.h
paddle/fluid/operators/distributed/request_handler.h
+2
-2
paddle/fluid/operators/distributed/request_handler_impl.cc
paddle/fluid/operators/distributed/request_handler_impl.cc
+12
-13
paddle/fluid/operators/distributed/rpc_server.cc
paddle/fluid/operators/distributed/rpc_server.cc
+10
-10
paddle/fluid/operators/distributed/variable_response.cc
paddle/fluid/operators/distributed/variable_response.cc
+4
-4
paddle/fluid/operators/distributed_ops/checkpoint_notify_op.cc
...e/fluid/operators/distributed_ops/checkpoint_notify_op.cc
+2
-2
paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc
paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc
+1
-1
paddle/fluid/operators/distributed_ops/gen_nccl_id_op.cc
paddle/fluid/operators/distributed_ops/gen_nccl_id_op.cc
+5
-5
paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc
paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc
+17
-17
paddle/fluid/operators/distributed_ops/prefetch_op.cc
paddle/fluid/operators/distributed_ops/prefetch_op.cc
+3
-3
paddle/fluid/operators/distributed_ops/recv_op.cc
paddle/fluid/operators/distributed_ops/recv_op.cc
+1
-1
paddle/fluid/operators/distributed_ops/send_barrier_op.cc
paddle/fluid/operators/distributed_ops/send_barrier_op.cc
+2
-2
paddle/fluid/operators/distributed_ops/send_op.cc
paddle/fluid/operators/distributed_ops/send_op.cc
+2
-2
paddle/fluid/operators/distributed_ops/send_recv_op_test.cc
paddle/fluid/operators/distributed_ops/send_recv_op_test.cc
+2
-2
paddle/fluid/operators/distributed_ops/split_byref_op.h
paddle/fluid/operators/distributed_ops/split_byref_op.h
+1
-1
paddle/fluid/operators/distributed_ops/split_ids_op.h
paddle/fluid/operators/distributed_ops/split_ids_op.h
+1
-1
paddle/fluid/operators/lod_rank_table_op.cc
paddle/fluid/operators/lod_rank_table_op.cc
+2
-2
paddle/fluid/operators/lookup_table_op.cc
paddle/fluid/operators/lookup_table_op.cc
+4
-4
paddle/fluid/operators/math/cpu_vec_test.cc
paddle/fluid/operators/math/cpu_vec_test.cc
+2
-2
paddle/fluid/operators/math/jit_kernel_test.cc
paddle/fluid/operators/math/jit_kernel_test.cc
+40
-51
paddle/fluid/operators/math/selected_rows_functor.cc
paddle/fluid/operators/math/selected_rows_functor.cc
+2
-2
paddle/fluid/operators/math/selected_rows_functor.cu
paddle/fluid/operators/math/selected_rows_functor.cu
+2
-2
paddle/fluid/operators/mul_op.cc
paddle/fluid/operators/mul_op.cc
+3
-3
paddle/fluid/operators/nccl/nccl_op.cu.cc
paddle/fluid/operators/nccl/nccl_op.cu.cc
+15
-16
paddle/fluid/operators/nccl/nccl_op_test.cu.cc
paddle/fluid/operators/nccl/nccl_op_test.cu.cc
+7
-7
paddle/fluid/operators/nce_op.cc
paddle/fluid/operators/nce_op.cc
+7
-7
paddle/fluid/operators/optimizers/adam_op.h
paddle/fluid/operators/optimizers/adam_op.h
+1
-1
paddle/fluid/operators/optimizers/momentum_op.h
paddle/fluid/operators/optimizers/momentum_op.h
+1
-1
paddle/fluid/operators/optimizers/sgd_op.h
paddle/fluid/operators/optimizers/sgd_op.h
+4
-4
paddle/fluid/operators/random_crop_op.h
paddle/fluid/operators/random_crop_op.h
+2
-2
paddle/fluid/operators/reader/blocking_queue.h
paddle/fluid/operators/reader/blocking_queue.h
+2
-2
paddle/fluid/operators/reader/create_shuffle_reader_op.cc
paddle/fluid/operators/reader/create_shuffle_reader_op.cc
+3
-3
paddle/fluid/operators/recurrent_op.cc
paddle/fluid/operators/recurrent_op.cc
+13
-13
paddle/fluid/operators/rnn_memory_helper_op.cc
paddle/fluid/operators/rnn_memory_helper_op.cc
+1
-1
paddle/fluid/operators/save_op.cc
paddle/fluid/operators/save_op.cc
+1
-1
paddle/fluid/operators/sequence_ops/sequence_mask_op.h
paddle/fluid/operators/sequence_ops/sequence_mask_op.h
+1
-1
paddle/fluid/operators/sum_mkldnn_op.cc
paddle/fluid/operators/sum_mkldnn_op.cc
+1
-1
paddle/fluid/operators/sum_op.cc
paddle/fluid/operators/sum_op.cc
+3
-3
paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
+6
-6
paddle/fluid/platform/device_tracer.cc
paddle/fluid/platform/device_tracer.cc
+4
-4
paddle/fluid/platform/dynload/dynamic_loader.cc
paddle/fluid/platform/dynload/dynamic_loader.cc
+2
-2
paddle/fluid/platform/gpu_info.cc
paddle/fluid/platform/gpu_info.cc
+2
-2
paddle/fluid/platform/init.cc
paddle/fluid/platform/init.cc
+1
-1
paddle/fluid/platform/nccl_helper.h
paddle/fluid/platform/nccl_helper.h
+1
-1
paddle/fluid/pybind/protobuf.cc
paddle/fluid/pybind/protobuf.cc
+3
-3
paddle/fluid/train/demo/demo_trainer.cc
paddle/fluid/train/demo/demo_trainer.cc
+1
-1
paddle/legacy/pserver/ParameterClient2.cpp
paddle/legacy/pserver/ParameterClient2.cpp
+8
-8
paddle/testing/TestUtil.cpp
paddle/testing/TestUtil.cpp
+1
-1
python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
...ests/unittests/test_parallel_executor_test_while_train.py
+1
-1
未找到文件。
paddle/fluid/framework/data_device_transform.cc
浏览文件 @
d3e140a5
...
...
@@ -18,8 +18,8 @@ namespace framework {
void
TransDataDevice
(
const
Tensor
&
in
,
const
platform
::
Place
&
dst_place
,
Tensor
*
out
)
{
VLOG
(
3
0
)
<<
"DeviceTransform in, src_place "
<<
in
.
place
()
<<
" dst_place: "
<<
dst_place
;
VLOG
(
3
)
<<
"DeviceTransform in, src_place "
<<
in
.
place
()
<<
" dst_place: "
<<
dst_place
;
PADDLE_ENFORCE_NE
(
in
.
place
().
which
(),
dst_place
.
which
(),
...
...
paddle/fluid/framework/data_device_transform_test.cu
浏览文件 @
d3e140a5
...
...
@@ -49,10 +49,10 @@ class TestOpWithKernel : public OperatorWithKernel {
OpKernelType
GetExpectedKernelType
(
const
ExecutionContext
&
ctx
)
const
override
{
if
(
Attr
<
bool
>
(
"use_gpu"
))
{
VLOG
(
3
0
)
<<
"force use gpu kernel"
;
VLOG
(
3
)
<<
"force use gpu kernel"
;
return
OpKernelType
(
proto
::
VarType
::
FP32
,
platform
::
CUDAPlace
(
0
));
}
else
{
VLOG
(
3
0
)
<<
"use default kernel"
;
VLOG
(
3
)
<<
"use default kernel"
;
return
OpKernelType
(
proto
::
VarType
::
FP32
,
ctx
.
Input
<
Tensor
>
(
"input"
)
->
place
());
}
...
...
@@ -148,7 +148,7 @@ TEST(Operator, CPUtoGPU) {
// get output
auto
*
output2
=
scope
.
Var
(
"OUT2"
);
gpu_op
->
Run
(
scope
,
cuda_place
);
VLOG
(
3
0
)
<<
"after gpu_op run"
;
VLOG
(
3
)
<<
"after gpu_op run"
;
// auto* output2_ptr = output2->Get<LoDTensor>().data<float>();
paddle
::
platform
::
DeviceContextPool
&
pool
=
...
...
paddle/fluid/framework/details/broadcast_op_handle.cc
浏览文件 @
d3e140a5
...
...
@@ -60,7 +60,7 @@ void BroadcastOpHandle::BroadcastOneVar(
PADDLE_ENFORCE_NOT_NULL
(
in_var
);
Tensor
&
in_tensor
=
VariableVisitor
::
GetMutableTensor
(
in_var
);
if
(
UNLIKELY
(
!
in_tensor
.
IsInitialized
()))
{
VLOG
(
3
0
)
<<
"in var "
<<
in_var_handle
.
name_
<<
"not inited, return!"
;
VLOG
(
3
)
<<
"in var "
<<
in_var_handle
.
name_
<<
"not inited, return!"
;
return
;
}
...
...
paddle/fluid/framework/details/modify_op_lock_and_record_event_pass.cc
浏览文件 @
d3e140a5
...
...
@@ -45,8 +45,8 @@ std::unique_ptr<ir::Graph> ModifyOpLockAndRecordEventPass::ApplyImpl(
IsLockAndRecordEventFreeComputationOpHandle
(
compute_op
,
graph_view
);
compute_op
->
SetLockAndRecordEventFree
(
is_lock_and_record_event_free
);
if
(
is_lock_and_record_event_free
)
{
VLOG
(
10
0
)
<<
"Set is_lock_and_record_event_free be true in op "
<<
compute_op
->
DebugString
();
VLOG
(
10
)
<<
"Set is_lock_and_record_event_free be true in op "
<<
compute_op
->
DebugString
();
}
}
return
ir_graph
;
...
...
paddle/fluid/framework/details/multi_devices_graph_pass.cc
浏览文件 @
d3e140a5
...
...
@@ -399,7 +399,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
for
(
size_t
i
=
0
;
i
<
backward_vars
.
size
();
i
+=
2
)
{
auto
&
p_name
=
backward_vars
[
i
];
auto
&
g_name
=
backward_vars
[
i
+
1
];
VLOG
(
10
0
)
<<
"Bcast "
<<
g_name
<<
" for parameter "
<<
p_name
;
VLOG
(
10
)
<<
"Bcast "
<<
g_name
<<
" for parameter "
<<
p_name
;
switch
(
strategy_
.
reduce_
)
{
case
BuildStrategy
::
ReduceStrategy
::
kReduce
:
...
...
@@ -809,8 +809,8 @@ int MultiDevSSAGraphBuilder::CreateRPCOp(
node
->
Op
()
->
GetAttr
(
OpProtoAndCheckerMaker
::
OpRoleVarAttrName
()));
PADDLE_ENFORCE_EQ
(
send_param_grad
.
size
(),
2U
);
op_dev_id
=
GetAppropriateDeviceID
({
send_param_grad
[
1
]});
VLOG
(
10
0
)
<<
"send grad "
<<
input_var_names
[
0
]
<<
" origin "
<<
send_param_grad
[
1
]
<<
" place: "
<<
op_dev_id
;
VLOG
(
10
)
<<
"send grad "
<<
input_var_names
[
0
]
<<
" origin "
<<
send_param_grad
[
1
]
<<
" place: "
<<
op_dev_id
;
for
(
auto
&
varname
:
input_var_names
)
{
sharded_var_device
->
emplace
(
varname
,
op_dev_id
);
}
...
...
@@ -826,9 +826,9 @@ int MultiDevSSAGraphBuilder::CreateRPCOp(
if
(
recv_param_grad
.
size
()
==
2U
)
{
op_dev_id
=
GetVarDeviceID
(
*
result
,
recv_param_grad
[
1
],
*
sharded_var_device
);
VLOG
(
10
0
)
<<
"recv param "
<<
recv_param_grad
[
0
]
<<
" get grad place: "
<<
recv_param_grad
[
1
]
<<
" place: "
<<
op_dev_id
;
VLOG
(
10
)
<<
"recv param "
<<
recv_param_grad
[
0
]
<<
" get grad place: "
<<
recv_param_grad
[
1
]
<<
" place: "
<<
op_dev_id
;
}
else
{
op_dev_id
=
GetAppropriateDeviceID
(
output_var_names
);
}
...
...
paddle/fluid/framework/details/reference_count_pass.cc
浏览文件 @
d3e140a5
...
...
@@ -140,8 +140,8 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
if
(
next_compute_op
!=
nullptr
)
{
if
(
compute_ref_cnt_map
.
count
(
next_compute_op
))
{
compute_ref_cnt_map
[
next_compute_op
]
->
AddVar
(
var_name
);
VLOG
(
5
0
)
<<
"Add reference count of "
<<
var_name
<<
" to Operator "
<<
next_compute_op
->
Name
();
VLOG
(
5
)
<<
"Add reference count of "
<<
var_name
<<
" to Operator "
<<
next_compute_op
->
Name
();
}
else
{
// Create new reference_count_op_handle
ir
::
Node
*
ref_cnt_node
=
graph
->
CreateEmptyNode
(
...
...
paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
浏览文件 @
d3e140a5
...
...
@@ -51,7 +51,7 @@ void ScaleLossGradOpHandle::RunImpl() {
->
stream
();
memory
::
Copy
(
boost
::
get
<
platform
::
CUDAPlace
>
(
place_
),
tmp
,
platform
::
CPUPlace
(),
&
coeff_
,
sizeof
(
float
),
stream
);
VLOG
(
10
0
)
<<
place_
<<
"RUN Scale loss grad op"
;
VLOG
(
10
)
<<
place_
<<
"RUN Scale loss grad op"
;
});
#endif
}
...
...
paddle/fluid/framework/details/sequential_execution_pass.cc
浏览文件 @
d3e140a5
...
...
@@ -94,8 +94,8 @@ std::unique_ptr<ir::Graph> SequentialExecutionPass::ApplyImpl(
op_node_list
[
i
-
1
]
->
outputs
.
push_back
(
dep_var
);
dep_var
->
outputs
.
push_back
(
op_node_list
[
i
]);
dep_var
->
inputs
.
push_back
(
op_node_list
[
i
-
1
]);
VLOG
(
10
0
)
<<
"Add dependencies between "
<<
op_node_list
[
i
-
1
]
->
Name
()
<<
" and "
<<
op_node_list
[
i
]
->
Name
();
VLOG
(
10
)
<<
"Add dependencies between "
<<
op_node_list
[
i
-
1
]
->
Name
()
<<
" and "
<<
op_node_list
[
i
]
->
Name
();
}
return
graph
;
}
...
...
paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
浏览文件 @
d3e140a5
...
...
@@ -210,16 +210,16 @@ void ThreadedSSAGraphExecutor::RunOp(
details
::
OpHandleBase
*
op
)
{
auto
op_run
=
[
ready_var_q
,
op
,
this
]
{
try
{
if
(
VLOG_IS_ON
(
10
0
))
{
VLOG
(
10
0
)
<<
op
<<
" "
<<
op
->
Name
()
<<
" : "
<<
op
->
DebugString
();
if
(
VLOG_IS_ON
(
10
))
{
VLOG
(
10
)
<<
op
<<
" "
<<
op
->
Name
()
<<
" : "
<<
op
->
DebugString
();
}
if
(
LIKELY
(
!
strategy_
.
dry_run_
))
{
op
->
Run
(
strategy_
.
use_cuda_
);
}
VLOG
(
10
0
)
<<
op
<<
" "
<<
op
->
Name
()
<<
" Done "
;
VLOG
(
10
)
<<
op
<<
" "
<<
op
->
Name
()
<<
" Done "
;
running_ops_
--
;
ready_var_q
->
Extend
(
op
->
Outputs
());
VLOG
(
10
0
)
<<
op
<<
" "
<<
op
->
Name
()
<<
"Signal posted"
;
VLOG
(
10
)
<<
op
<<
" "
<<
op
->
Name
()
<<
"Signal posted"
;
}
catch
(...)
{
exception_holder_
.
Catch
(
std
::
current_exception
());
}
...
...
paddle/fluid/framework/executor.cc
浏览文件 @
d3e140a5
...
...
@@ -46,7 +46,7 @@ ExecutorPrepareContext::ExecutorPrepareContext(
}
ExecutorPrepareContext
::~
ExecutorPrepareContext
()
{
VLOG
(
5
0
)
<<
"destroy ExecutorPrepareContext"
;
VLOG
(
5
)
<<
"destroy ExecutorPrepareContext"
;
}
template
<
typename
RefCntMap
>
...
...
@@ -63,7 +63,7 @@ static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op,
if
((
it
->
second
)
--
==
1
)
{
auto
*
var
=
scope
.
FindVar
(
name
);
if
(
var
!=
nullptr
)
{
VLOG
(
10
0
)
<<
"Erase tensor
\'
"
<<
name
<<
"
\'
"
;
VLOG
(
10
)
<<
"Erase tensor
\'
"
<<
name
<<
"
\'
"
;
if
(
var
->
IsType
<
LoDTensor
>
())
{
erase_tensors
.
insert
(
var
->
GetMutable
<
LoDTensor
>
());
}
else
if
(
var
->
IsType
<
SelectedRows
>
())
{
...
...
@@ -162,21 +162,21 @@ void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope,
if
(
var
->
Persistable
())
{
auto
*
ptr
=
const_cast
<
Scope
*>
(
ancestor_scope
)
->
Var
(
var
->
Name
());
InitializeVariable
(
ptr
,
var
->
GetType
());
VLOG
(
3
0
)
<<
"Create Variable "
<<
var
->
Name
()
<<
" global, which pointer is "
<<
ptr
;
VLOG
(
3
)
<<
"Create Variable "
<<
var
->
Name
()
<<
" global, which pointer is "
<<
ptr
;
}
else
{
auto
*
ptr
=
scope
->
Var
(
var
->
Name
());
InitializeVariable
(
ptr
,
var
->
GetType
());
VLOG
(
3
0
)
<<
"Create Variable "
<<
var
->
Name
()
<<
" locally, which pointer is "
<<
ptr
;
VLOG
(
3
)
<<
"Create Variable "
<<
var
->
Name
()
<<
" locally, which pointer is "
<<
ptr
;
}
}
}
else
{
for
(
auto
&
var
:
global_block
.
AllVars
())
{
auto
*
ptr
=
scope
->
Var
(
var
->
Name
());
InitializeVariable
(
ptr
,
var
->
GetType
());
VLOG
(
3
0
)
<<
"Create variable "
<<
var
->
Name
()
<<
", which pointer is "
<<
ptr
;
VLOG
(
3
)
<<
"Create variable "
<<
var
->
Name
()
<<
", which pointer is "
<<
ptr
;
}
}
}
...
...
@@ -307,7 +307,7 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
int
i
=
0
;
for
(
auto
&
feed_target
:
(
*
feed_targets
))
{
std
::
string
var_name
=
feed_target
.
first
;
VLOG
(
3
0
)
<<
"feed target's name: "
<<
var_name
;
VLOG
(
3
)
<<
"feed target's name: "
<<
var_name
;
// prepend feed op
auto
*
op
=
global_block
->
PrependOp
();
...
...
@@ -330,7 +330,7 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
int
i
=
0
;
for
(
auto
&
fetch_target
:
(
*
fetch_targets
))
{
std
::
string
var_name
=
fetch_target
.
first
;
VLOG
(
3
0
)
<<
"fetch target's name: "
<<
var_name
;
VLOG
(
3
)
<<
"fetch target's name: "
<<
var_name
;
// append fetch op
auto
*
op
=
global_block
->
AppendOp
();
...
...
@@ -482,7 +482,7 @@ void Executor::RunPreparedContext(
void
Executor
::
EnableMKLDNN
(
const
ProgramDesc
&
program
)
{
#ifdef PADDLE_WITH_MKLDNN
VLOG
(
3
0
)
<<
"use_mkldnn=True"
;
VLOG
(
3
)
<<
"use_mkldnn=True"
;
for
(
size_t
bid
=
0
;
bid
<
program
.
Size
();
++
bid
)
{
auto
*
block
=
const_cast
<
ProgramDesc
&>
(
program
).
MutableBlock
(
bid
);
for
(
auto
*
op
:
block
->
AllOps
())
{
...
...
paddle/fluid/framework/feed_fetch_method.cc
浏览文件 @
d3e140a5
...
...
@@ -25,7 +25,7 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input,
const
std
::
string
&
var_name
,
size_t
index
)
{
// If var_name Variable is not found in GlobalScope, a new variable will
// be created.
VLOG
(
3
0
)
<<
"SetFeedVariable name="
<<
var_name
<<
" index="
<<
index
;
VLOG
(
3
)
<<
"SetFeedVariable name="
<<
var_name
<<
" index="
<<
index
;
Variable
*
g_feed_value
=
scope
->
Var
(
var_name
);
auto
&
feed_inputs
=
*
(
g_feed_value
->
GetMutable
<
FeedFetchList
>
());
if
(
index
>=
feed_inputs
.
size
())
{
...
...
@@ -47,8 +47,8 @@ LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name,
typeid
(
FeedFetchList
).
name
());
auto
&
fetch_outputs
=
*
g_fetch_value
->
GetMutable
<
FeedFetchList
>
();
auto
&
tensor
=
fetch_outputs
[
index
];
VLOG
(
3
0
)
<<
"Fetch "
<<
var_name
<<
" with index "
<<
index
<<
" shape= "
<<
tensor
.
dims
();
VLOG
(
3
)
<<
"Fetch "
<<
var_name
<<
" with index "
<<
index
<<
" shape= "
<<
tensor
.
dims
();
PADDLE_ENFORCE_LT
(
index
,
fetch_outputs
.
size
());
return
tensor
;
}
...
...
paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
浏览文件 @
d3e140a5
...
...
@@ -147,19 +147,19 @@ void PrepareParameters(Graph* graph, const Param& param) {
scope
->
Var
(
param
.
LSTMX
)
->
GetMutable
<
LoDTensor
>
();
scope
->
Var
(
param
.
LSTMOUT
)
->
GetMutable
<
LoDTensor
>
();
#define GATE_W(name__)
\
auto* W_##name__##_w0 = scope->FindVar(#name__ ".w_0");
\
auto* W_##name__##_w1 = scope->FindVar(#name__ ".w_1");
\
auto* W_##name__##_b0 = scope->FindVar(#name__ ".b_0");
\
CHECK_P3(W_##name__##_w0, W_##name__##_w1, W_##name__##_b0);
\
VLOG(4
0
) << #name__ "_w0" \
<< " shape: " << W_##name__##_w0->Get<LoDTensor>().dims(); \
VLOG(4
0
) << #name__ "_w1" \
<< " shape: " << W_##name__##_w1->Get<LoDTensor>().dims(); \
VLOG(4
0
) << #name__ "_b0" \
<< " shape: " << W_##name__##_b0->Get<LoDTensor>().dims(); \
auto& W_##name__##_w0_t = W_##name__##_w0->Get<LoDTensor>();
\
auto& W_##name__##_w1_t = W_##name__##_w1->Get<LoDTensor>();
\
#define GATE_W(name__) \
auto* W_##name__##_w0 = scope->FindVar(#name__ ".w_0"); \
auto* W_##name__##_w1 = scope->FindVar(#name__ ".w_1"); \
auto* W_##name__##_b0 = scope->FindVar(#name__ ".b_0"); \
CHECK_P3(W_##name__##_w0, W_##name__##_w1, W_##name__##_b0); \
VLOG(4) << #name__ "_w0" \
<< " shape: " << W_##name__##_w0->Get<LoDTensor>().dims(); \
VLOG(4) << #name__ "_w1" \
<< " shape: " << W_##name__##_w1->Get<LoDTensor>().dims(); \
VLOG(4) << #name__ "_b0" \
<< " shape: " << W_##name__##_b0->Get<LoDTensor>().dims(); \
auto& W_##name__##_w0_t = W_##name__##_w0->Get<LoDTensor>(); \
auto& W_##name__##_w1_t = W_##name__##_w1->Get<LoDTensor>(); \
auto& W_##name__##_b0_t = W_##name__##_b0->Get<LoDTensor>();
GATE_W
(
forget
);
...
...
@@ -208,7 +208,7 @@ void PrepareLSTMWeight(const LoDTensor& W_forget_w0,
int
D
=
W_forget_w0
.
dims
()[
0
];
int
M
=
W_forget_w1
.
dims
()[
0
];
out
->
Resize
(
make_ddim
({
D
+
M
,
4
*
D
}));
VLOG
(
3
0
)
<<
"LSTMWeight resized to "
<<
out
->
dims
();
VLOG
(
3
)
<<
"LSTMWeight resized to "
<<
out
->
dims
();
float
*
out_data
=
out
->
mutable_data
<
float
>
(
platform
::
CPUPlace
());
std
::
array
<
const
float
*
,
4
>
tensors
{
...
...
paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc
浏览文件 @
d3e140a5
...
...
@@ -57,7 +57,7 @@ std::unique_ptr<ir::Graph> ConvBiasFusePass::ApplyImpl(
int
found_conv_bias_count
=
0
;
auto
handler
=
[
&
](
const
GraphPatternDetector
::
subgraph_t
&
subgraph
,
Graph
*
g
)
{
VLOG
(
4
0
)
<<
"handle ConvBias fuse"
;
VLOG
(
4
)
<<
"handle ConvBias fuse"
;
GET_IR_NODE_FROM_SUBGRAPH
(
conv_weight
,
conv_weight
,
conv_bias_pattern
);
// Filter
GET_IR_NODE_FROM_SUBGRAPH
(
conv_out
,
conv_out
,
conv_bias_pattern
);
// tmp
...
...
@@ -74,7 +74,7 @@ std::unique_ptr<ir::Graph> ConvBiasFusePass::ApplyImpl(
// check if fuse can be done and if MKL-DNN should be used
FuseOptions
fuse_option
=
FindFuseOption
(
*
conv
,
*
eltwise
);
if
(
fuse_option
==
DO_NOT_FUSE
||
fuse_option
==
FUSE_NATIVE
)
{
VLOG
(
3
0
)
<<
"do not perform conv+bias fuse"
;
VLOG
(
3
)
<<
"do not perform conv+bias fuse"
;
return
;
}
...
...
paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
浏览文件 @
d3e140a5
...
...
@@ -121,7 +121,7 @@ std::unique_ptr<ir::Graph> ConvBNFusePass::ApplyImpl(
int
found_conv_bn_count
=
0
;
auto
handler
=
[
&
](
const
GraphPatternDetector
::
subgraph_t
&
subgraph
,
Graph
*
g
)
{
VLOG
(
4
0
)
<<
"handle ConvBN fuse"
;
VLOG
(
4
)
<<
"handle ConvBN fuse"
;
// conv, batch_norm,
// conv_weight, conv_out,
...
...
@@ -133,7 +133,7 @@ std::unique_ptr<ir::Graph> ConvBNFusePass::ApplyImpl(
// check if fuse can be done and if MKL-DNN should be used
FuseOptions
fuse_option
=
FindFuseOption
(
*
conv
,
*
batch_norm
);
if
(
fuse_option
==
DO_NOT_FUSE
)
{
VLOG
(
3
0
)
<<
"do not perform conv+bn fuse"
;
VLOG
(
3
)
<<
"do not perform conv+bn fuse"
;
return
;
}
...
...
@@ -241,7 +241,7 @@ std::unique_ptr<ir::Graph> ConvEltwiseAddBNFusePass::ApplyImpl(
int
found_conv_bn_count
=
0
;
auto
handler
=
[
&
](
const
GraphPatternDetector
::
subgraph_t
&
subgraph
,
Graph
*
g
)
{
VLOG
(
4
0
)
<<
"handle ConvBN fuse"
;
VLOG
(
4
)
<<
"handle ConvBN fuse"
;
// conv, batch_norm,
// conv_weight, conv_out,
...
...
paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc
浏览文件 @
d3e140a5
...
...
@@ -38,7 +38,7 @@ std::unique_ptr<ir::Graph> ConvReLUFusePass::ApplyImpl(
int
found_conv_relu_count
=
0
;
auto
handler
=
[
&
](
const
GraphPatternDetector
::
subgraph_t
&
subgraph
,
Graph
*
g
)
{
VLOG
(
4
0
)
<<
"handle ConvReLU fuse"
;
VLOG
(
4
)
<<
"handle ConvReLU fuse"
;
GET_IR_NODE_FROM_SUBGRAPH
(
conv_weight
,
conv_weight
,
conv_relu_pattern
);
// Filter
GET_IR_NODE_FROM_SUBGRAPH
(
conv_out
,
conv_out
,
conv_relu_pattern
);
// tmp
...
...
@@ -48,7 +48,7 @@ std::unique_ptr<ir::Graph> ConvReLUFusePass::ApplyImpl(
FuseOptions
fuse_option
=
FindFuseOption
(
*
conv
,
*
relu
);
if
(
fuse_option
==
DO_NOT_FUSE
)
{
VLOG
(
3
0
)
<<
"do not perform conv+relu fuse"
;
VLOG
(
3
)
<<
"do not perform conv+relu fuse"
;
return
;
}
...
...
paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.cc
浏览文件 @
d3e140a5
...
...
@@ -39,7 +39,7 @@ std::unique_ptr<ir::Graph> DepthwiseConvMKLDNNPass::ApplyImpl(
int
found_depthwise_conv_mkldnn_count
=
0
;
auto
handler
=
[
&
](
const
GraphPatternDetector
::
subgraph_t
&
subgraph
,
Graph
*
g
)
{
VLOG
(
3
0
)
<<
"handle DepthwiseConvMKLDNN fuse"
;
VLOG
(
3
)
<<
"handle DepthwiseConvMKLDNN fuse"
;
GET_NODE
(
depthwise_conv
,
(
*
pattern
));
depthwise_conv
->
Op
()
->
SetType
(
"conv2d"
);
found_depthwise_conv_mkldnn_count
++
;
...
...
paddle/fluid/framework/ir/fc_fuse_pass.cc
浏览文件 @
d3e140a5
...
...
@@ -39,7 +39,7 @@ std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl(
int
found_fc_count
=
0
;
auto
handler
=
[
&
](
const
GraphPatternDetector
::
subgraph_t
&
subgraph
,
Graph
*
g
)
{
VLOG
(
4
0
)
<<
"handle FC fuse"
;
VLOG
(
4
)
<<
"handle FC fuse"
;
GET_IR_NODE_FROM_SUBGRAPH
(
w
,
w
,
fc_pattern
);
GET_IR_NODE_FROM_SUBGRAPH
(
fc_bias
,
bias
,
fc_pattern
);
GET_IR_NODE_FROM_SUBGRAPH
(
fc_out
,
Out
,
fc_pattern
);
...
...
paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc
浏览文件 @
d3e140a5
...
...
@@ -61,7 +61,7 @@ std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseElewiseAddAct(
auto
handler
=
[
&
](
const
GraphPatternDetector
::
subgraph_t
&
subgraph
,
Graph
*
g
)
{
VLOG
(
4
0
)
<<
"handle FuseElewiseAddAct fuse"
;
VLOG
(
4
)
<<
"handle FuseElewiseAddAct fuse"
;
GET_IR_NODE_FROM_SUBGRAPH
(
ele_y
,
ele_y
,
elewise_add_act_pattern
);
GET_IR_NODE_FROM_SUBGRAPH
(
ele_out
,
elewise_add_out
,
elewise_add_act_pattern
);
...
...
@@ -77,10 +77,10 @@ std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseElewiseAddAct(
Node
*
elewise_add_act_node
=
CreateFuseElewiseAddActNode
(
g
,
act
,
ele_add
,
ele_x_n
,
ele_y_n
,
ele_out_n
,
act_out_n
);
VLOG
(
4
0
)
<<
"
\n\t
"
<<
ele_x_n
<<
" and "
<<
ele_y_n
<<
" -> "
<<
ele_add
->
Name
()
<<
" -> "
<<
ele_out_n
<<
"
\n
"
<<
"
\t
"
<<
ele_out_n
<<
" -> "
<<
act
->
Name
()
<<
" -> "
<<
act_out_n
;
VLOG
(
4
)
<<
"
\n\t
"
<<
ele_x_n
<<
" and "
<<
ele_y_n
<<
" -> "
<<
ele_add
->
Name
()
<<
" -> "
<<
ele_out_n
<<
"
\n
"
<<
"
\t
"
<<
ele_out_n
<<
" -> "
<<
act
->
Name
()
<<
" -> "
<<
act_out_n
;
ReLinkNodes
(
g
,
ele_out
,
ele_add
,
act
,
elewise_add_act_node
);
found_elewise_add_act_count
++
;
...
...
@@ -113,7 +113,7 @@ std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseActElewiseAdd(
auto
handler
=
[
&
](
const
GraphPatternDetector
::
subgraph_t
&
subgraph
,
Graph
*
g
)
{
VLOG
(
4
0
)
<<
"handle FuseElewiseAddAct fuse"
;
VLOG
(
4
)
<<
"handle FuseElewiseAddAct fuse"
;
GET_IR_NODE_FROM_SUBGRAPH
(
act_out
,
act_out
,
act_elewise_add_pattern
);
GET_IR_NODE_FROM_SUBGRAPH
(
ele_x
,
ele_x
,
act_elewise_add_pattern
);
GET_IR_NODE_FROM_SUBGRAPH
(
ele_out
,
elewise_add_out
,
...
...
@@ -129,9 +129,9 @@ std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseActElewiseAdd(
Node
*
elewise_add_act_node
=
CreateFuseElewiseAddActNode
(
g
,
ele_add
,
act
,
elewise_add_x_n
,
act_i_n
,
act_o_n
,
elewise_add_out_n
);
VLOG
(
4
0
)
<<
"
\n\t
"
<<
act_i_n
<<
" -> "
<<
act
->
Name
()
<<
" -> "
<<
act_o_n
<<
"
\n\t
"
<<
act_o_n
<<
" and "
<<
elewise_add_x_n
<<
" -> "
<<
ele_add
->
Name
()
<<
" -> "
<<
elewise_add_out_n
;
VLOG
(
4
)
<<
"
\n\t
"
<<
act_i_n
<<
" -> "
<<
act
->
Name
()
<<
" -> "
<<
act_o_n
<<
"
\n\t
"
<<
act_o_n
<<
" and "
<<
elewise_add_x_n
<<
" -> "
<<
ele_add
->
Name
()
<<
" -> "
<<
elewise_add_out_n
;
ReLinkNodes
(
g
,
act_out
,
act
,
ele_add
,
elewise_add_act_node
);
found_elewise_add_act_count
++
;
...
...
@@ -165,7 +165,7 @@ std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseElewiseAddActInplaceGrad(
auto
handler
=
[
&
](
const
GraphPatternDetector
::
subgraph_t
&
subgraph
,
Graph
*
g
)
{
VLOG
(
4
0
)
<<
"handle FuseElewiseAddActGrad1 fuse"
;
VLOG
(
4
)
<<
"handle FuseElewiseAddActGrad1 fuse"
;
GET_IR_NODE_FROM_SUBGRAPH
(
act_out
,
act_out
,
elewise_add_act_grad_pattern
);
GET_IR_NODE_FROM_SUBGRAPH
(
act_grad
,
act_grad
,
elewise_add_act_grad_pattern
);
GET_IR_NODE_FROM_SUBGRAPH
(
d_itermediate_out
,
d_itermediate_out
,
...
...
@@ -208,10 +208,10 @@ std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseElewiseAddActInplaceGrad(
auto
fused_node
=
g
->
CreateOpNode
(
&
desc
);
VLOG
(
4
0
)
<<
"
\n\t
"
<<
d_act_out_n
<<
" and "
<<
act_out_n
<<
" -> "
<<
act_grad
->
Name
()
<<
" -> "
<<
d_itermediate_out_n
<<
"
\n\t
"
<<
d_itermediate_out_n
<<
" and "
<<
act_out_n
<<
" -> "
<<
ele_add_grad
->
Name
()
<<
" -> "
<<
d_itermediate_out_n
;
VLOG
(
4
)
<<
"
\n\t
"
<<
d_act_out_n
<<
" and "
<<
act_out_n
<<
" -> "
<<
act_grad
->
Name
()
<<
" -> "
<<
d_itermediate_out_n
<<
"
\n\t
"
<<
d_itermediate_out_n
<<
" and "
<<
act_out_n
<<
" -> "
<<
ele_add_grad
->
Name
()
<<
" -> "
<<
d_itermediate_out_n
;
ReLinkNodes
(
g
,
d_itermediate_out
,
act_grad
,
ele_add_grad
,
fused_node
);
found_elewise_add_act_count
++
;
...
...
paddle/fluid/framework/ir/graph.cc
浏览文件 @
d3e140a5
...
...
@@ -90,7 +90,7 @@ Graph::Graph(const ProgramDesc &program) : program_(program) {
std
::
map
<
std
::
string
,
std
::
vector
<
ir
::
Node
*>>
Graph
::
InitFromProgram
(
const
ProgramDesc
&
program
)
{
VLOG
(
3
0
)
<<
"block in program:"
<<
program_
.
Size
();
VLOG
(
3
)
<<
"block in program:"
<<
program_
.
Size
();
std
::
unordered_map
<
std
::
string
,
VarDesc
*>
all_vars
;
// var nodes for each var name, will have multiple versions in SSA
std
::
map
<
std
::
string
,
std
::
vector
<
ir
::
Node
*>>
var_nodes
;
...
...
@@ -158,7 +158,7 @@ void Graph::ResolveHazard(
auto
it_old
=
versions
.
rbegin
();
++
it_old
;
for
(;
it_old
!=
versions
.
rend
();
it_new
=
it_old
,
++
it_old
)
{
VLOG
(
3
0
)
<<
"deal with var: "
<<
(
*
it_new
)
->
Name
();
VLOG
(
3
)
<<
"deal with var: "
<<
(
*
it_new
)
->
Name
();
ir
::
Node
*
write_op
=
(
*
it_new
)
->
inputs
.
empty
()
?
nullptr
:
(
*
it_new
)
->
inputs
[
0
];
const
auto
&
read_ops
=
(
*
it_old
)
->
outputs
;
...
...
paddle/fluid/framework/ir/graph.h
浏览文件 @
d3e140a5
...
...
@@ -89,7 +89,7 @@ class Graph {
attr_name
);
attrs_
[
attr_name
]
=
attr
;
attr_dels_
[
attr_name
]
=
[
attr
,
attr_name
]()
{
VLOG
(
3
0
)
<<
"deleting "
<<
attr_name
;
VLOG
(
3
)
<<
"deleting "
<<
attr_name
;
delete
attr
;
};
}
...
...
paddle/fluid/framework/ir/graph_helper.cc
浏览文件 @
d3e140a5
...
...
@@ -40,9 +40,8 @@ void SortHelper(
}
}
VLOG
(
30
)
<<
"topology sort insert: "
<<
node
->
Name
()
<<
reinterpret_cast
<
void
*>
(
node
)
<<
" input "
<<
node
->
inputs
.
size
();
VLOG
(
3
)
<<
"topology sort insert: "
<<
node
->
Name
()
<<
reinterpret_cast
<
void
*>
(
node
)
<<
" input "
<<
node
->
inputs
.
size
();
ret
->
push_back
(
node
);
}
...
...
@@ -111,9 +110,9 @@ std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
for
(
auto
&
var
:
n
->
inputs
)
{
for
(
auto
&
adj_n
:
var
->
inputs
)
{
PADDLE_ENFORCE
(
adj_n
->
NodeType
()
==
ir
::
Node
::
Type
::
kOperation
);
VLOG
(
4
0
)
<<
"adj "
<<
adj_n
->
Name
()
<<
reinterpret_cast
<
void
*>
(
adj_n
)
<<
" -> "
<<
n
->
Name
()
<<
reinterpret_cast
<
void
*>
(
n
)
<<
" via "
<<
var
->
Name
()
<<
reinterpret_cast
<
void
*>
(
var
);
VLOG
(
4
)
<<
"adj "
<<
adj_n
->
Name
()
<<
reinterpret_cast
<
void
*>
(
adj_n
)
<<
" -> "
<<
n
->
Name
()
<<
reinterpret_cast
<
void
*>
(
n
)
<<
" via "
<<
var
->
Name
()
<<
reinterpret_cast
<
void
*>
(
var
);
adj_list
[
n
].
insert
(
adj_n
);
}
}
...
...
paddle/fluid/framework/ir/graph_pattern_detector.cc
浏览文件 @
d3e140a5
...
...
@@ -92,19 +92,19 @@ void GraphPatternDetector::operator()(Graph *graph,
PrettyLogEndl
(
Style
::
detail
(),
"--- detect %d subgraphs"
,
subgraphs
.
size
());
int
id
=
0
;
for
(
auto
&
g
:
subgraphs
)
{
VLOG
(
3
0
)
<<
"optimizing #"
<<
id
++
<<
" subgraph"
;
VLOG
(
3
)
<<
"optimizing #"
<<
id
++
<<
" subgraph"
;
handler
(
g
,
graph
);
}
}
bool
GraphPatternDetector
::
MarkPDNodesInGraph
(
const
ir
::
Graph
&
graph
)
{
VLOG
(
3
0
)
<<
"mark pdnodes in graph"
;
VLOG
(
3
)
<<
"mark pdnodes in graph"
;
if
(
graph
.
Nodes
().
empty
())
return
false
;
for
(
auto
&
node
:
GraphTraits
::
DFS
(
graph
))
{
for
(
const
auto
&
pdnode
:
pattern_
.
nodes
())
{
if
(
pdnode
->
Tell
(
&
node
))
{
VLOG
(
4
0
)
<<
"pdnode "
<<
pdnode
->
name
()
<<
" marked"
;
VLOG
(
4
)
<<
"pdnode "
<<
pdnode
->
name
()
<<
" marked"
;
pdnodes2nodes_
[
pdnode
.
get
()].
insert
(
&
node
);
}
}
...
...
@@ -112,7 +112,7 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph &graph) {
// Check to early stop if some PDNode can't find matched Node.
for
(
auto
&
pdnode
:
pattern_
.
nodes
())
{
if
(
!
pdnodes2nodes_
.
count
(
pdnode
.
get
()))
{
VLOG
(
4
0
)
<<
pdnode
->
name
()
<<
" can't find matched Node, early stop"
;
VLOG
(
4
)
<<
pdnode
->
name
()
<<
" can't find matched Node, early stop"
;
// return false;
}
}
...
...
@@ -121,7 +121,7 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph &graph) {
GetMarkedNodes
(
const_cast
<
Graph
*>
(
&
graph
)).
insert
(
n
);
}
}
VLOG
(
3
0
)
<<
pdnodes2nodes_
.
size
()
<<
" nodes marked"
;
VLOG
(
3
)
<<
pdnodes2nodes_
.
size
()
<<
" nodes marked"
;
return
!
pdnodes2nodes_
.
empty
();
}
...
...
@@ -215,7 +215,7 @@ GraphPatternDetector::DetectPatterns() {
// Extend a PDNode to subgraphs by deducing the connection relations defined
// in edges of PDNodes.
for
(
const
auto
&
edge
:
pattern_
.
edges
())
{
VLOG
(
4
0
)
<<
"check "
<<
edge
.
first
->
name
()
<<
" -> "
<<
edge
.
second
->
name
();
VLOG
(
4
)
<<
"check "
<<
edge
.
first
->
name
()
<<
" -> "
<<
edge
.
second
->
name
();
// TODO(Superjomn) Fix bug here, the groups might be duplicate here.
// Each role has two PDNodes, which indicates two roles.
// Detect two Nodes that can match these two roles and they are connected.
...
...
@@ -226,7 +226,7 @@ GraphPatternDetector::DetectPatterns() {
// source -> target
for
(
Node
*
source
:
pdnodes2nodes_
[
edge
.
first
])
{
for
(
Node
*
target
:
pdnodes2nodes_
[
edge
.
second
])
{
VLOG
(
8
0
)
<<
"check "
<<
source
->
id
()
<<
" -- "
<<
target
->
id
();
VLOG
(
8
)
<<
"check "
<<
source
->
id
()
<<
" -- "
<<
target
->
id
();
// TODO(Superjomn) add some prune strategies.
for
(
const
auto
&
group
:
pre_groups
)
{
if
(
IsNodesLink
(
source
,
target
))
{
...
...
@@ -243,13 +243,12 @@ GraphPatternDetector::DetectPatterns() {
}
}
}
VLOG
(
3
0
)
<<
"step "
<<
step
<<
" get records: "
<<
cur_groups
.
size
();
VLOG
(
3
)
<<
"step "
<<
step
<<
" get records: "
<<
cur_groups
.
size
();
for
(
auto
&
group
:
cur_groups
)
{
for
(
auto
&
item
:
group
.
roles
)
{
VLOG
(
40
)
<<
"node "
<<
item
.
second
->
id
()
<<
" as "
<<
item
.
first
->
name
();
VLOG
(
4
)
<<
"node "
<<
item
.
second
->
id
()
<<
" as "
<<
item
.
first
->
name
();
}
VLOG
(
4
0
)
<<
"========================================================="
;
VLOG
(
4
)
<<
"========================================================="
;
}
}
...
...
paddle/fluid/framework/ir/graph_viz_pass.cc
浏览文件 @
d3e140a5
...
...
@@ -41,7 +41,7 @@ std::string FormatName(const Node* node) {
std
::
unique_ptr
<
ir
::
Graph
>
GraphVizPass
::
ApplyImpl
(
std
::
unique_ptr
<
ir
::
Graph
>
graph
)
const
{
const
std
::
string
graph_viz_path
=
Get
<
std
::
string
>
(
kGraphVizPath
);
VLOG
(
3
0
)
<<
"draw IR graph viz to "
<<
graph_viz_path
;
VLOG
(
3
)
<<
"draw IR graph viz to "
<<
graph_viz_path
;
std
::
unique_ptr
<
std
::
ostream
>
fout
(
new
std
::
ofstream
(
graph_viz_path
));
PADDLE_ENFORCE
(
fout
->
good
());
std
::
ostream
&
sout
=
*
fout
;
...
...
paddle/fluid/framework/ir/mkldnn_placement_pass.cc
浏览文件 @
d3e140a5
...
...
@@ -20,7 +20,7 @@ namespace ir {
std
::
unique_ptr
<
ir
::
Graph
>
MKLDNNPlacementPass
::
ApplyImpl
(
std
::
unique_ptr
<
ir
::
Graph
>
graph
)
const
{
VLOG
(
3
0
)
<<
"Aplies MKL-DNN placement strategy."
;
VLOG
(
3
)
<<
"Aplies MKL-DNN placement strategy."
;
for
(
const
Node
*
n
:
graph
->
Nodes
())
{
if
(
n
->
IsOp
()
&&
n
->
Op
()
->
HasAttr
(
"use_mkldnn"
))
{
n
->
Op
()
->
SetAttr
(
"use_mkldnn"
,
true
);
...
...
paddle/fluid/framework/ir/multi_batch_merge_pass.cc
浏览文件 @
d3e140a5
...
...
@@ -62,7 +62,7 @@ VarDesc UpdateGradVarDesc(
string
::
Sprintf
(
"%s.repeat.%d"
,
var_desc
->
Name
(),
repeat
);
VarDesc
repeated_var
=
CopyVarDesc
(
var_desc
);
repeated_var
.
SetName
(
new_gname
);
VLOG
(
3
0
)
<<
"update "
<<
var_desc
->
Name
()
<<
" to repeat "
<<
repeat
;
VLOG
(
3
)
<<
"update "
<<
var_desc
->
Name
()
<<
" to repeat "
<<
repeat
;
return
repeated_var
;
}
return
*
var_desc
;
...
...
@@ -78,7 +78,7 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
std
::
vector
<
ir
::
Node
*>
nodes
=
TopologySortOperations
(
*
graph
);
auto
origin_nodes
=
graph
->
ReleaseNodes
();
VLOG
(
3
0
)
<<
"origin nodes count: "
<<
origin_nodes
.
size
();
VLOG
(
3
)
<<
"origin nodes count: "
<<
origin_nodes
.
size
();
ir
::
Graph
&
result
=
*
graph
;
// 1. record op nodes of different roles
...
...
@@ -137,8 +137,8 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
"%s.repeat.%d"
,
repeated_op
.
Input
(
"Variance"
)[
0
],
i
);
bn_vars_need_rename
.
insert
(
repeated_op
.
Input
(
"Mean"
)[
0
]);
bn_vars_need_rename
.
insert
(
repeated_op
.
Input
(
"Variance"
)[
0
]);
VLOG
(
3
0
)
<<
"renaming "
<<
repeated_op
.
Input
(
"Mean"
)[
0
]
<<
" to "
<<
new_mean_name
;
VLOG
(
3
)
<<
"renaming "
<<
repeated_op
.
Input
(
"Mean"
)[
0
]
<<
" to "
<<
new_mean_name
;
repeated_op
.
RenameInput
(
repeated_op
.
Input
(
"Mean"
)[
0
],
new_mean_name
);
repeated_op
.
RenameInput
(
repeated_op
.
Input
(
"Variance"
)[
0
],
new_var_name
);
repeated_op
.
RenameOutput
(
repeated_op
.
Output
(
"MeanOut"
)[
0
],
...
...
paddle/fluid/framework/ir/pass.h
浏览文件 @
d3e140a5
...
...
@@ -76,7 +76,7 @@ class Pass {
attr_name
);
attrs_
[
attr_name
]
=
attr
;
attr_dels_
[
attr_name
]
=
[
attr
,
attr_name
]()
{
VLOG
(
3
0
)
<<
"deleting "
<<
attr_name
;
VLOG
(
3
)
<<
"deleting "
<<
attr_name
;
delete
attr
;
};
}
...
...
paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc
浏览文件 @
d3e140a5
...
...
@@ -196,7 +196,7 @@ std::unique_ptr<ir::Graph> SeqConcatFcFusePass::ApplyImpl(
detector
(
graph
.
get
(),
[
&
](
const
GraphPatternDetector
::
subgraph_t
&
subgraph
,
Graph
*
graph
)
{
VLOG
(
4
0
)
<<
"get one concat pattern"
;
VLOG
(
4
)
<<
"get one concat pattern"
;
// fc
GET_NODE
(
fc_w
,
detector
.
pattern
());
GET_NODE
(
fc_bias
,
detector
.
pattern
());
...
...
paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc
浏览文件 @
d3e140a5
...
...
@@ -60,7 +60,7 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope) {
auto
handler
=
[
&
](
const
GraphPatternDetector
::
subgraph_t
&
subgraph
,
Graph
*
g
)
{
VLOG
(
4
0
)
<<
"handle SeqConv EltAdd Relu fuse"
;
VLOG
(
4
)
<<
"handle SeqConv EltAdd Relu fuse"
;
GET_IR_NODE_FROM_SUBGRAPH
(
seqconv
,
seqconv
,
fuse_pattern
);
GET_IR_NODE_FROM_SUBGRAPH
(
seqconv_weight
,
seqconv_weight
,
fuse_pattern
);
GET_IR_NODE_FROM_SUBGRAPH
(
seqconv_out
,
seqconv_out
,
fuse_pattern
);
...
...
paddle/fluid/framework/lod_rank_table.cc
浏览文件 @
d3e140a5
...
...
@@ -31,7 +31,7 @@ void LoDRankTable::Reset(const LoD& lod, size_t level) {
TableItem
item
;
item
.
index
=
i
;
item
.
length
=
vec
[
i
+
1
]
-
vec
[
i
];
VLOG
(
10
0
)
<<
"Add item to rank table "
<<
item
.
index
<<
" "
<<
item
.
length
;
VLOG
(
10
)
<<
"Add item to rank table "
<<
item
.
index
<<
" "
<<
item
.
length
;
items_
.
emplace_back
(
item
);
}
// NOTE(yuyang18):
...
...
paddle/fluid/framework/mixed_vector_test.cc
浏览文件 @
d3e140a5
...
...
@@ -51,7 +51,7 @@ TEST(mixed_vector, InitWithCount) {
TEST
(
mixed_vector
,
ForEach
)
{
vec
<
int
>
tmp
;
for
(
auto
&
v
:
tmp
)
{
VLOG
(
3
0
)
<<
v
;
VLOG
(
3
)
<<
v
;
}
}
...
...
paddle/fluid/framework/op_desc.cc
浏览文件 @
d3e140a5
...
...
@@ -82,7 +82,7 @@ class CompileTimeInferShapeContext : public InferShapeContext {
auto
*
in_var
=
block_
.
FindVarRecursive
(
Inputs
(
in
)[
i
]);
auto
*
out_var
=
block_
.
FindVarRecursive
(
Outputs
(
out
)[
j
]);
if
(
in_var
->
GetType
()
!=
proto
::
VarType
::
LOD_TENSOR
)
{
VLOG
(
3
0
)
<<
"input "
<<
in
<<
" is not LodTensor"
;
VLOG
(
3
)
<<
"input "
<<
in
<<
" is not LodTensor"
;
return
;
}
out_var
->
SetLoDLevel
(
in_var
->
GetLoDLevel
());
...
...
@@ -241,38 +241,38 @@ void OpDesc::SetAttr(const std::string &name, const Attribute &v) {
const
proto
::
OpProto
::
Attr
&
attr
=
GetProtoAttr
(
name
);
switch
(
attr
.
type
())
{
case
proto
::
AttrType
::
BOOLEANS
:
{
VLOG
(
11
0
)
<<
"SetAttr: "
<<
Type
()
<<
", "
<<
name
<<
" from INTS to BOOLEANS"
;
VLOG
(
11
)
<<
"SetAttr: "
<<
Type
()
<<
", "
<<
name
<<
" from INTS to BOOLEANS"
;
this
->
attrs_
[
name
]
=
std
::
vector
<
bool
>
();
break
;
}
case
proto
::
AttrType
::
INTS
:
{
VLOG
(
11
0
)
<<
"SetAttr: "
<<
Type
()
<<
", "
<<
name
<<
" from INTS to INTS"
;
VLOG
(
11
)
<<
"SetAttr: "
<<
Type
()
<<
", "
<<
name
<<
" from INTS to INTS"
;
this
->
attrs_
[
name
]
=
std
::
vector
<
int
>
();
break
;
}
case
proto
::
AttrType
::
LONGS
:
{
VLOG
(
11
0
)
<<
"SetAttr: "
<<
Type
()
<<
", "
<<
name
<<
" from LONGS to LONGS"
;
VLOG
(
11
)
<<
"SetAttr: "
<<
Type
()
<<
", "
<<
name
<<
" from LONGS to LONGS"
;
this
->
attrs_
[
name
]
=
std
::
vector
<
int64_t
>
();
break
;
}
case
proto
::
AttrType
::
FLOATS
:
{
VLOG
(
11
0
)
<<
"SetAttr: "
<<
Type
()
<<
", "
<<
name
<<
" from INTS to FLOATS"
;
VLOG
(
11
)
<<
"SetAttr: "
<<
Type
()
<<
", "
<<
name
<<
" from INTS to FLOATS"
;
this
->
attrs_
[
name
]
=
std
::
vector
<
float
>
();
break
;
}
case
proto
::
AttrType
::
STRINGS
:
{
VLOG
(
11
0
)
<<
"SetAttr: "
<<
Type
()
<<
", "
<<
name
<<
" from INTS to STRINGS"
;
VLOG
(
11
)
<<
"SetAttr: "
<<
Type
()
<<
", "
<<
name
<<
" from INTS to STRINGS"
;
this
->
attrs_
[
name
]
=
std
::
vector
<
std
::
string
>
();
break
;
}
case
proto
::
AttrType
::
BLOCKS
:
{
VLOG
(
11
0
)
<<
"SetAttr: "
<<
Type
()
<<
", "
<<
name
<<
" from INTS to BLOCKS"
;
VLOG
(
11
)
<<
"SetAttr: "
<<
Type
()
<<
", "
<<
name
<<
" from INTS to BLOCKS"
;
this
->
SetBlocksAttr
(
name
,
std
::
vector
<
BlockDesc
*>
());
return
;
}
...
...
@@ -505,13 +505,13 @@ void OpDesc::CheckAttrs() {
}
void
OpDesc
::
InferShape
(
const
BlockDesc
&
block
)
const
{
VLOG
(
3
0
)
<<
"CompileTime infer shape on "
<<
Type
();
VLOG
(
3
)
<<
"CompileTime infer shape on "
<<
Type
();
InitInferShapeFuncs
();
auto
&
infer_shape
=
OpInfoMap
::
Instance
().
Get
(
this
->
Type
()).
infer_shape_
;
PADDLE_ENFORCE
(
static_cast
<
bool
>
(
infer_shape
),
"%s's infer_shape has not been registered"
,
this
->
Type
());
CompileTimeInferShapeContext
ctx
(
*
this
,
block
);
if
(
VLOG_IS_ON
(
10
0
))
{
if
(
VLOG_IS_ON
(
10
))
{
std
::
ostringstream
sout
;
auto
inames
=
this
->
InputArgumentNames
();
sout
<<
" From ["
;
...
...
@@ -522,7 +522,7 @@ void OpDesc::InferShape(const BlockDesc &block) const {
std
::
copy
(
onames
.
begin
(),
onames
.
end
(),
std
::
ostream_iterator
<
std
::
string
>
(
sout
,
", "
));
sout
<<
"]"
;
VLOG
(
10
0
)
<<
sout
.
str
();
VLOG
(
10
)
<<
sout
.
str
();
}
infer_shape
(
&
ctx
);
}
...
...
@@ -613,7 +613,7 @@ DDim CompileTimeInferShapeContext::GetDim(const std::string &name) const {
auto
shape
=
var
->
GetShape
();
res
=
shape
.
empty
()
?
make_ddim
({
0UL
})
:
make_ddim
(
shape
);
}
catch
(...)
{
VLOG
(
5
0
)
<<
"GetDim of variable "
<<
name
<<
" error"
;
VLOG
(
5
)
<<
"GetDim of variable "
<<
name
<<
" error"
;
std
::
rethrow_exception
(
std
::
current_exception
());
}
return
res
;
...
...
@@ -630,7 +630,7 @@ std::vector<DDim> CompileTimeInferShapeContext::GetRepeatedDims(
res
.
push_back
(
s
.
empty
()
?
make_ddim
({
0UL
})
:
make_ddim
(
s
));
}
}
catch
(...)
{
VLOG
(
5
0
)
<<
"GetRepeatedDim of variable "
<<
name
<<
" error."
;
VLOG
(
5
)
<<
"GetRepeatedDim of variable "
<<
name
<<
" error."
;
std
::
rethrow_exception
(
std
::
current_exception
());
}
return
res
;
...
...
paddle/fluid/framework/op_registry.cc
浏览文件 @
d3e140a5
...
...
@@ -46,9 +46,9 @@ static VariableNameMap ConvertOpDescVarsToVarNameMap(
std
::
unique_ptr
<
OperatorBase
>
OpRegistry
::
CreateOp
(
const
proto
::
OpDesc
&
op_desc
)
{
VLOG
(
1
0
)
<<
"CreateOp directly from OpDesc is deprecated. It should only be"
"used in unit tests. Use CreateOp(const OpDesc& op_desc) "
"instead."
;
VLOG
(
1
)
<<
"CreateOp directly from OpDesc is deprecated. It should only be"
"used in unit tests. Use CreateOp(const OpDesc& op_desc) "
"instead."
;
VariableNameMap
inputs
=
ConvertOpDescVarsToVarNameMap
(
op_desc
.
inputs
());
VariableNameMap
outputs
=
ConvertOpDescVarsToVarNameMap
(
op_desc
.
outputs
());
AttributeMap
attrs
;
...
...
paddle/fluid/framework/operator.cc
浏览文件 @
d3e140a5
...
...
@@ -139,7 +139,7 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
}
void
OperatorBase
::
Run
(
const
Scope
&
scope
,
const
platform
::
Place
&
place
)
{
VLOG
(
4
0
)
<<
place
<<
" "
<<
DebugStringEx
(
&
scope
);
VLOG
(
4
)
<<
place
<<
" "
<<
DebugStringEx
(
&
scope
);
if
(
platform
::
is_gpu_place
(
place
))
{
#ifndef PADDLE_WITH_CUDA
PADDLE_THROW
(
"Cannot run operator on place %s"
,
place
);
...
...
@@ -159,7 +159,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
}
else
{
RunImpl
(
scope
,
place
);
}
VLOG
(
3
0
)
<<
place
<<
" "
<<
DebugStringEx
(
&
scope
);
VLOG
(
3
)
<<
place
<<
" "
<<
DebugStringEx
(
&
scope
);
}
bool
OperatorBase
::
HasInputs
(
const
std
::
string
&
name
)
const
{
...
...
@@ -716,14 +716,14 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
auto
expected_kernel_key
=
this
->
GetExpectedKernelType
(
ExecutionContext
(
*
this
,
scope
,
*
dev_ctx
));
VLOG
(
3
0
)
<<
"expected_kernel_key:"
<<
expected_kernel_key
;
VLOG
(
3
)
<<
"expected_kernel_key:"
<<
expected_kernel_key
;
auto
kernel_iter
=
kernels
.
find
(
expected_kernel_key
);
#ifdef PADDLE_WITH_MKLDNN
// workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set
if
(
kernel_iter
==
kernels
.
end
()
&&
expected_kernel_key
.
library_type_
==
LibraryType
::
kMKLDNN
)
{
VLOG
(
3
0
)
<<
"missing MKLDNN kernel: fallbacking to PLAIN one"
;
VLOG
(
3
)
<<
"missing MKLDNN kernel: fallbacking to PLAIN one"
;
expected_kernel_key
.
library_type_
=
LibraryType
::
kPlain
;
expected_kernel_key
.
data_layout_
=
DataLayout
::
kAnyLayout
;
kernel_iter
=
kernels
.
find
(
expected_kernel_key
);
...
...
@@ -775,8 +775,7 @@ void OperatorWithKernel::TransferInplaceVarsBack(
const
Scope
&
scope
,
const
std
::
vector
<
std
::
string
>&
inplace_vars
,
const
Scope
&
transfer_scope
)
const
{
for
(
auto
&
var_name
:
inplace_vars
)
{
VLOG
(
30
)
<<
"share inplace var "
+
var_name
+
" back to it's original scope"
;
VLOG
(
3
)
<<
"share inplace var "
+
var_name
+
" back to it's original scope"
;
auto
*
original_tensor
=
GetMutableLoDTensorOrSelectedRowsValueFromVar
(
scope
.
FindVar
(
var_name
));
auto
*
var
=
transfer_scope
.
FindVar
(
var_name
);
...
...
@@ -817,8 +816,8 @@ Scope* OperatorWithKernel::TryTransferData(
transfered_inplace_vars
->
emplace_back
(
var_name
);
}
VLOG
(
3
0
)
<<
"Transform Variable "
<<
var_name
<<
" from "
<<
kernel_type_for_var
<<
" to "
<<
expected_kernel_key
;
VLOG
(
3
)
<<
"Transform Variable "
<<
var_name
<<
" from "
<<
kernel_type_for_var
<<
" to "
<<
expected_kernel_key
;
// In the inference scenerio, the scopes will be reused across the
// batches, so the `new_scope` here will result in GPU memroy explosion
...
...
paddle/fluid/framework/parallel_executor.cc
浏览文件 @
d3e140a5
...
...
@@ -208,7 +208,7 @@ void ParallelExecutor::BCastParamsToDevices(
auto
&
main_tensor
=
main_var
->
Get
<
LoDTensor
>
();
if
(
!
main_tensor
.
IsInitialized
())
{
VLOG
(
3
0
)
<<
"one in var not inited, return!"
;
VLOG
(
3
)
<<
"one in var not inited, return!"
;
continue
;
}
auto
&
dims
=
main_tensor
.
dims
();
...
...
paddle/fluid/framework/scope.cc
浏览文件 @
d3e140a5
...
...
@@ -162,7 +162,7 @@ Variable* Scope::VarInternal(const std::string& name) {
v
=
new
Variable
();
vars_
[
name
].
reset
(
v
);
VLOG
(
3
0
)
<<
"Create variable "
<<
name
;
VLOG
(
3
)
<<
"Create variable "
<<
name
;
v
->
name_
=
&
(
vars_
.
find
(
name
)
->
first
);
return
v
;
}
...
...
paddle/fluid/framework/selected_rows.cc
浏览文件 @
d3e140a5
...
...
@@ -206,7 +206,7 @@ void SelectedRows::Get(const framework::Tensor& ids, framework::Tensor* value,
PADDLE_ENFORCE
(
value
->
IsInitialized
(),
"The value tensor should be initialized."
);
if
(
ids
.
numel
()
==
0
)
{
VLOG
(
3
0
)
<<
"keys is empty, please check data!"
;
VLOG
(
3
)
<<
"keys is empty, please check data!"
;
}
else
{
int64_t
value_width
=
value_
->
numel
()
/
value_
->
dims
()[
0
];
PADDLE_ENFORCE_EQ
(
value_width
,
value
->
numel
()
/
value
->
dims
()[
0
],
...
...
paddle/fluid/framework/tensor_util.cc
浏览文件 @
d3e140a5
...
...
@@ -22,8 +22,8 @@ namespace framework {
void
TensorCopy
(
const
Tensor
&
src
,
const
platform
::
Place
&
dst_place
,
const
platform
::
DeviceContext
&
ctx
,
Tensor
*
dst
)
{
VLOG
(
3
0
)
<<
"TensorCopy "
<<
src
.
dims
()
<<
" from "
<<
src
.
place
()
<<
" to "
<<
dst_place
;
VLOG
(
3
)
<<
"TensorCopy "
<<
src
.
dims
()
<<
" from "
<<
src
.
place
()
<<
" to "
<<
dst_place
;
src
.
check_memory_size
();
dst
->
Resize
(
src
.
dims
());
...
...
@@ -37,8 +37,8 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
if
(
platform
::
is_cpu_place
(
src_place
)
&&
platform
::
is_cpu_place
(
dst_place
))
{
if
(
src_ptr
==
dst_ptr
)
{
VLOG
(
3
0
)
<<
"Skip copy the same data async from "
<<
src_place
<<
" to "
<<
dst_place
;
VLOG
(
3
)
<<
"Skip copy the same data async from "
<<
src_place
<<
" to "
<<
dst_place
;
return
;
}
memory
::
Copy
(
boost
::
get
<
platform
::
CPUPlace
>
(
dst_place
),
dst_ptr
,
...
...
@@ -77,8 +77,8 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
reinterpret_cast
<
const
platform
::
CUDADeviceContext
&>
(
ctx
).
stream
();
if
(
platform
::
is_same_place
(
src_place
,
dst_place
))
{
if
(
src_ptr
==
dst_ptr
)
{
VLOG
(
3
0
)
<<
"Skip copy the same data async from "
<<
src_place
<<
" to "
<<
dst_place
;
VLOG
(
3
)
<<
"Skip copy the same data async from "
<<
src_place
<<
" to "
<<
dst_place
;
return
;
}
memory
::
Copy
(
dst_gpu_place
,
dst_ptr
,
src_gpu_place
,
src_ptr
,
size
,
...
...
@@ -114,8 +114,8 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
void
TensorCopySync
(
const
Tensor
&
src
,
const
platform
::
Place
&
dst_place
,
Tensor
*
dst
)
{
VLOG
(
3
0
)
<<
"TensorCopySync "
<<
src
.
dims
()
<<
" from "
<<
src
.
place
()
<<
" to "
<<
dst_place
;
VLOG
(
3
)
<<
"TensorCopySync "
<<
src
.
dims
()
<<
" from "
<<
src
.
place
()
<<
" to "
<<
dst_place
;
src
.
check_memory_size
();
dst
->
Resize
(
src
.
dims
());
dst
->
set_layout
(
src
.
layout
());
...
...
@@ -125,8 +125,8 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
auto
size
=
src
.
numel
()
*
SizeOfType
(
src
.
type
());
if
(
platform
::
is_cpu_place
(
src_place
)
&&
platform
::
is_cpu_place
(
dst_place
))
{
if
(
src_ptr
==
dst_ptr
)
{
VLOG
(
3
0
)
<<
"Skip copy the same data from "
<<
src_place
<<
" to "
<<
dst_place
;
VLOG
(
3
)
<<
"Skip copy the same data from "
<<
src_place
<<
" to "
<<
dst_place
;
return
;
}
memory
::
Copy
(
boost
::
get
<
platform
::
CPUPlace
>
(
dst_place
),
dst_ptr
,
...
...
@@ -146,8 +146,8 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
}
else
if
(
platform
::
is_gpu_place
(
src_place
)
&&
platform
::
is_gpu_place
(
dst_place
))
{
if
(
src_ptr
==
dst_ptr
&&
platform
::
is_same_place
(
src_place
,
dst_place
))
{
VLOG
(
3
0
)
<<
"Skip copy the same data from "
<<
src_place
<<
" to "
<<
dst_place
;
VLOG
(
3
)
<<
"Skip copy the same data from "
<<
src_place
<<
" to "
<<
dst_place
;
return
;
}
auto
src_gpu_place
=
boost
::
get
<
platform
::
CUDAPlace
>
(
src_place
);
...
...
paddle/fluid/framework/threadpool.cc
浏览文件 @
d3e140a5
...
...
@@ -39,7 +39,7 @@ void ThreadPool::Init() {
int
num_threads
=
std
::
thread
::
hardware_concurrency
();
if
(
FLAGS_dist_threadpool_size
>
0
)
{
num_threads
=
FLAGS_dist_threadpool_size
;
VLOG
(
1
0
)
<<
"set dist_threadpool_size to "
<<
num_threads
;
VLOG
(
1
)
<<
"set dist_threadpool_size to "
<<
num_threads
;
}
PADDLE_ENFORCE_GT
(
num_threads
,
0
);
threadpool_
.
reset
(
new
ThreadPool
(
num_threads
));
...
...
paddle/fluid/framework/var_desc.cc
浏览文件 @
d3e140a5
...
...
@@ -61,10 +61,10 @@ size_t VarDesc::GetTensorDescNum() const {
void
VarDesc
::
SetShapes
(
const
std
::
vector
<
std
::
vector
<
int64_t
>>
&
multiple_dims
)
{
if
(
multiple_dims
.
size
()
!=
GetTensorDescNum
())
{
VLOG
(
3
0
)
<<
"WARNING: The number of given shapes("
<<
multiple_dims
.
size
()
<<
") doesn't match the existing tensor number("
<<
GetTensorDescNum
()
<<
"). The Reader is going to be reinitialized."
;
VLOG
(
3
)
<<
"WARNING: The number of given shapes("
<<
multiple_dims
.
size
()
<<
") doesn't match the existing tensor number("
<<
GetTensorDescNum
()
<<
"). The Reader is going to be reinitialized."
;
SetTensorDescNum
(
multiple_dims
.
size
());
}
std
::
vector
<
proto
::
VarType
::
TensorDesc
*>
tensors
=
mutable_tensor_descs
();
...
...
@@ -94,11 +94,11 @@ void VarDesc::SetDataType(proto::VarType::Type data_type) {
void
VarDesc
::
SetDataTypes
(
const
std
::
vector
<
proto
::
VarType
::
Type
>
&
multiple_data_type
)
{
if
(
multiple_data_type
.
size
()
!=
GetTensorDescNum
())
{
VLOG
(
3
0
)
<<
"WARNING: The number of given data types("
<<
multiple_data_type
.
size
()
<<
") doesn't match the existing tensor number("
<<
GetTensorDescNum
()
<<
"). The Reader is going to be reinitialized."
;
VLOG
(
3
)
<<
"WARNING: The number of given data types("
<<
multiple_data_type
.
size
()
<<
") doesn't match the existing tensor number("
<<
GetTensorDescNum
()
<<
"). The Reader is going to be reinitialized."
;
SetTensorDescNum
(
multiple_data_type
.
size
());
}
std
::
vector
<
proto
::
VarType
::
TensorDesc
*>
tensor_descs
=
...
...
@@ -139,11 +139,11 @@ void VarDesc::SetLoDLevel(int32_t lod_level) {
void
VarDesc
::
SetLoDLevels
(
const
std
::
vector
<
int32_t
>
&
multiple_lod_level
)
{
if
(
multiple_lod_level
.
size
()
!=
GetTensorDescNum
())
{
VLOG
(
3
0
)
<<
"WARNING: The number of given lod_levels("
<<
multiple_lod_level
.
size
()
<<
") doesn't match the existing tensor number("
<<
GetTensorDescNum
()
<<
"). The Reader is going to be reinitialized."
;
VLOG
(
3
)
<<
"WARNING: The number of given lod_levels("
<<
multiple_lod_level
.
size
()
<<
") doesn't match the existing tensor number("
<<
GetTensorDescNum
()
<<
"). The Reader is going to be reinitialized."
;
SetTensorDescNum
(
multiple_lod_level
.
size
());
}
switch
(
desc_
.
type
().
type
())
{
...
...
paddle/fluid/inference/api/analysis_predictor.cc
浏览文件 @
d3e140a5
...
...
@@ -55,7 +55,7 @@ bool IsPersistable(const framework::VarDesc *var) {
bool
AnalysisPredictor
::
Init
(
const
std
::
shared_ptr
<
framework
::
Scope
>
&
parent_scope
,
const
std
::
shared_ptr
<
framework
::
ProgramDesc
>
&
program
)
{
VLOG
(
3
0
)
<<
"Predictor::init()"
;
VLOG
(
3
)
<<
"Predictor::init()"
;
if
(
FLAGS_profile
)
{
LOG
(
WARNING
)
<<
"Profiler is actived, might affect the performance"
;
LOG
(
INFO
)
<<
"You can turn off by set gflags '-profile false'"
;
...
...
@@ -169,7 +169,7 @@ void AnalysisPredictor::SetMkldnnThreadID(int tid) {
bool
AnalysisPredictor
::
Run
(
const
std
::
vector
<
PaddleTensor
>
&
inputs
,
std
::
vector
<
PaddleTensor
>
*
output_data
,
int
batch_size
)
{
VLOG
(
3
0
)
<<
"Predictor::predict"
;
VLOG
(
3
)
<<
"Predictor::predict"
;
inference
::
Timer
timer
;
timer
.
tic
();
// set feed variable
...
...
@@ -188,7 +188,7 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
LOG
(
ERROR
)
<<
"fail to get fetches"
;
return
false
;
}
VLOG
(
3
0
)
<<
"predict cost: "
<<
timer
.
toc
()
<<
"ms"
;
VLOG
(
3
)
<<
"predict cost: "
<<
timer
.
toc
()
<<
"ms"
;
// Fix TensorArray reuse not cleaned bug.
tensor_array_batch_cleaner_
.
CollectTensorArrays
(
scope_
.
get
());
...
...
@@ -198,7 +198,7 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
bool
AnalysisPredictor
::
SetFeed
(
const
std
::
vector
<
PaddleTensor
>
&
inputs
,
framework
::
Scope
*
scope
)
{
VLOG
(
3
0
)
<<
"Predictor::set_feed"
;
VLOG
(
3
)
<<
"Predictor::set_feed"
;
if
(
inputs
.
size
()
!=
feeds_
.
size
())
{
LOG
(
ERROR
)
<<
"wrong feed input size, need "
<<
feeds_
.
size
()
<<
" but get "
<<
inputs
.
size
();
...
...
@@ -275,7 +275,7 @@ void AnalysisPredictor::GetFetchOne(const framework::LoDTensor &fetch,
bool
AnalysisPredictor
::
GetFetch
(
std
::
vector
<
PaddleTensor
>
*
outputs
,
framework
::
Scope
*
scope
)
{
VLOG
(
3
0
)
<<
"Predictor::get_fetch"
;
VLOG
(
3
)
<<
"Predictor::get_fetch"
;
outputs
->
resize
(
fetchs_
.
size
());
for
(
size_t
i
=
0
;
i
<
fetchs_
.
size
();
++
i
)
{
int
idx
=
boost
::
get
<
int
>
(
fetchs_
[
i
]
->
GetAttr
(
"col"
));
...
...
@@ -339,7 +339,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
template
<
>
std
::
unique_ptr
<
PaddlePredictor
>
CreatePaddlePredictor
<
AnalysisConfig
,
PaddleEngineKind
::
kAnalysis
>
(
const
AnalysisConfig
&
config
)
{
VLOG
(
3
0
)
<<
"create AnalysisConfig"
;
VLOG
(
3
)
<<
"create AnalysisConfig"
;
if
(
config
.
use_gpu
)
{
// 1. GPU memeroy
PADDLE_ENFORCE_GT
(
...
...
@@ -353,7 +353,7 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
std
::
string
flag
=
"--fraction_of_gpu_memory_to_use="
+
std
::
to_string
(
config
.
fraction_of_gpu_memory
);
flags
.
push_back
(
flag
);
VLOG
(
3
0
)
<<
"set flag: "
<<
flag
;
VLOG
(
3
)
<<
"set flag: "
<<
flag
;
framework
::
InitGflags
(
flags
);
}
}
...
...
paddle/fluid/inference/api/api_impl.cc
浏览文件 @
d3e140a5
...
...
@@ -152,7 +152,7 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
LOG
(
ERROR
)
<<
"fail to get fetches"
;
return
false
;
}
VLOG
(
3
0
)
<<
"predict cost: "
<<
timer
.
toc
()
<<
"ms"
;
VLOG
(
3
)
<<
"predict cost: "
<<
timer
.
toc
()
<<
"ms"
;
// Fix TensorArray reuse not cleaned bug.
tensor_array_batch_cleaner_
.
CollectTensorArrays
(
scope_
.
get
());
...
...
paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc
浏览文件 @
d3e140a5
...
...
@@ -44,7 +44,7 @@ void Main() {
config
.
fraction_of_gpu_memory
=
0.1
;
// set by yourself
predictor
=
CreatePaddlePredictor
(
config
);
VLOG
(
3
0
)
<<
"begin to process data"
;
VLOG
(
3
)
<<
"begin to process data"
;
// Just a single batch of data.
std
::
string
line
;
std
::
ifstream
file
(
FLAGS_data
);
...
...
@@ -59,13 +59,13 @@ void Main() {
PaddleBuf
(
record
.
data
.
data
(),
record
.
data
.
size
()
*
sizeof
(
float
));
input
.
dtype
=
PaddleDType
::
FLOAT32
;
VLOG
(
3
0
)
<<
"run executor"
;
VLOG
(
3
)
<<
"run executor"
;
std
::
vector
<
PaddleTensor
>
output
;
predictor
->
Run
({
input
},
&
output
,
1
);
VLOG
(
3
0
)
<<
"output.size "
<<
output
.
size
();
VLOG
(
3
)
<<
"output.size "
<<
output
.
size
();
auto
&
tensor
=
output
.
front
();
VLOG
(
3
0
)
<<
"output: "
<<
SummaryTensor
(
tensor
);
VLOG
(
3
)
<<
"output: "
<<
SummaryTensor
(
tensor
);
// compare with reference result
CheckOutput
(
FLAGS_refer
,
tensor
);
...
...
paddle/fluid/inference/api/demo_ci/utils.h
浏览文件 @
d3e140a5
...
...
@@ -47,7 +47,7 @@ static void split(const std::string& str, char sep,
}
Record
ProcessALine
(
const
std
::
string
&
line
)
{
VLOG
(
3
0
)
<<
"process a line"
;
VLOG
(
3
)
<<
"process a line"
;
std
::
vector
<
std
::
string
>
columns
;
split
(
line
,
'\t'
,
&
columns
);
CHECK_EQ
(
columns
.
size
(),
2UL
)
...
...
@@ -65,8 +65,8 @@ Record ProcessALine(const std::string& line) {
for
(
auto
&
s
:
shape_strs
)
{
record
.
shape
.
push_back
(
std
::
stoi
(
s
));
}
VLOG
(
3
0
)
<<
"data size "
<<
record
.
data
.
size
();
VLOG
(
3
0
)
<<
"data shape size "
<<
record
.
shape
.
size
();
VLOG
(
3
)
<<
"data size "
<<
record
.
data
.
size
();
VLOG
(
3
)
<<
"data shape size "
<<
record
.
shape
.
size
();
return
record
;
}
...
...
@@ -78,8 +78,8 @@ void CheckOutput(const std::string& referfile, const PaddleTensor& output) {
file
.
close
();
size_t
numel
=
output
.
data
.
length
()
/
PaddleDtypeSize
(
output
.
dtype
);
VLOG
(
3
0
)
<<
"predictor output numel "
<<
numel
;
VLOG
(
3
0
)
<<
"reference output numel "
<<
refer
.
data
.
size
();
VLOG
(
3
)
<<
"predictor output numel "
<<
numel
;
VLOG
(
3
)
<<
"reference output numel "
<<
refer
.
data
.
size
();
CHECK_EQ
(
numel
,
refer
.
data
.
size
());
switch
(
output
.
dtype
)
{
case
PaddleDType
::
INT64
:
{
...
...
paddle/fluid/inference/api/details/reset_tensor_array.cc
浏览文件 @
d3e140a5
...
...
@@ -26,7 +26,7 @@ void TensorArrayBatchCleaner::CollectTensorArrays(framework::Scope *scope) {
// parameter.
if
(
var_name
==
"feed"
||
var_name
==
"fetch"
)
continue
;
if
(
var
->
Type
()
==
typeid
(
framework
::
LoDTensorArray
))
{
VLOG
(
4
0
)
<<
"collect "
<<
var_name
;
VLOG
(
4
)
<<
"collect "
<<
var_name
;
arrays_
.
push_back
(
var
->
GetMutable
<
framework
::
LoDTensorArray
>
());
}
}
...
...
@@ -34,7 +34,7 @@ void TensorArrayBatchCleaner::CollectTensorArrays(framework::Scope *scope) {
CollectTensorArrays
(
kid
);
}
VLOG
(
3
0
)
<<
"Collect "
<<
arrays_
.
size
()
<<
" arrays"
;
VLOG
(
3
)
<<
"Collect "
<<
arrays_
.
size
()
<<
" arrays"
;
flag_
=
false
;
}
}
...
...
paddle/fluid/inference/io.cc
浏览文件 @
d3e140a5
...
...
@@ -78,7 +78,7 @@ void LoadPersistables(framework::Executor* executor, framework::Scope* scope,
for
(
auto
*
var
:
global_block
.
AllVars
())
{
if
(
IsPersistable
(
var
))
{
VLOG
(
3
0
)
<<
"persistable variable's name: "
<<
var
->
Name
();
VLOG
(
3
)
<<
"persistable variable's name: "
<<
var
->
Name
();
framework
::
VarDesc
*
new_var
=
load_block
->
Var
(
var
->
Name
());
new_var
->
SetShape
(
var
->
GetShape
());
...
...
@@ -121,7 +121,7 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor,
const
std
::
string
&
dirname
)
{
std
::
string
model_filename
=
dirname
+
"/__model__"
;
std
::
string
program_desc_str
;
VLOG
(
3
0
)
<<
"loading model from "
<<
model_filename
;
VLOG
(
3
)
<<
"loading model from "
<<
model_filename
;
ReadBinaryFile
(
model_filename
,
&
program_desc_str
);
std
::
unique_ptr
<
framework
::
ProgramDesc
>
main_program
(
...
...
paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
浏览文件 @
d3e140a5
...
...
@@ -53,7 +53,7 @@ class Pool2dOpConverter : public OpConverter {
public:
void
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
override
{
VLOG
(
4
0
)
VLOG
(
4
)
<<
"convert a fluid pool2d op to tensorrt pool2d layer without bias"
;
framework
::
OpDesc
op_desc
(
op
,
nullptr
);
// Declare inputs
...
...
paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
浏览文件 @
d3e140a5
...
...
@@ -27,7 +27,7 @@ struct Record {
};
Record
ProcessALine
(
const
std
::
string
&
line
)
{
VLOG
(
3
0
)
<<
"process a line"
;
VLOG
(
3
)
<<
"process a line"
;
std
::
vector
<
std
::
string
>
columns
;
split
(
line
,
'\t'
,
&
columns
);
CHECK_EQ
(
columns
.
size
(),
2UL
)
...
...
@@ -45,8 +45,8 @@ Record ProcessALine(const std::string &line) {
for
(
auto
&
s
:
shape_strs
)
{
record
.
shape
.
push_back
(
std
::
stoi
(
s
));
}
VLOG
(
3
0
)
<<
"data size "
<<
record
.
data
.
size
();
VLOG
(
3
0
)
<<
"data shape size "
<<
record
.
shape
.
size
();
VLOG
(
3
)
<<
"data size "
<<
record
.
data
.
size
();
VLOG
(
3
)
<<
"data shape size "
<<
record
.
shape
.
size
();
return
record
;
}
...
...
paddle/fluid/memory/allocation/allocator_facade.cc
浏览文件 @
d3e140a5
...
...
@@ -76,12 +76,12 @@ class ChunkedAllocator : public Allocator {
default_allocator_
=
raw_allocator_
;
}
else
{
if
(
capacity
==
1
)
{
VLOG
(
1
0
)
<<
"Create BestFitAllocator with chunk_size "
<<
max_chunk_size_
;
VLOG
(
1
)
<<
"Create BestFitAllocator with chunk_size "
<<
max_chunk_size_
;
default_allocator_
=
CreateAllocatorWithChunk
();
}
else
{
VLOG
(
1
0
)
<<
"Create AutoIncrementAllocator with chunk_size "
<<
max_chunk_size_
<<
" and capacity "
<<
capacity
;
VLOG
(
1
)
<<
"Create AutoIncrementAllocator with chunk_size "
<<
max_chunk_size_
<<
" and capacity "
<<
capacity
;
default_allocator_
=
std
::
make_shared
<
AutoIncrementAllocator
>
(
[
this
]
{
return
std
::
move
(
CreateAllocatorWithChunk
());
},
capacity
);
}
...
...
paddle/fluid/memory/allocation/legacy_allocator.cc
浏览文件 @
d3e140a5
...
...
@@ -86,18 +86,18 @@ struct NaiveAllocator {
template
<
>
void
*
Alloc
<
platform
::
CPUPlace
>
(
const
platform
::
CPUPlace
&
place
,
size_t
size
)
{
VLOG
(
1
0
)
<<
"Allocate "
<<
size
<<
" bytes on "
<<
platform
::
Place
(
place
);
VLOG
(
1
)
<<
"Allocate "
<<
size
<<
" bytes on "
<<
platform
::
Place
(
place
);
void
*
p
=
GetCPUBuddyAllocator
()
->
Alloc
(
size
);
if
(
FLAGS_init_allocated_mem
)
{
memset
(
p
,
0xEF
,
size
);
}
VLOG
(
10
0
)
<<
" pointer="
<<
p
;
VLOG
(
10
)
<<
" pointer="
<<
p
;
return
p
;
}
template
<
>
void
Free
<
platform
::
CPUPlace
>
(
const
platform
::
CPUPlace
&
place
,
void
*
p
)
{
VLOG
(
1
0
)
<<
"Free pointer="
<<
p
<<
" on "
<<
platform
::
Place
(
place
);
VLOG
(
1
)
<<
"Free pointer="
<<
p
<<
" on "
<<
platform
::
Place
(
place
);
GetCPUBuddyAllocator
()
->
Free
(
p
);
}
...
...
@@ -124,12 +124,12 @@ BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) {
std
::
unique_ptr
<
detail
::
SystemAllocator
>
(
new
detail
::
GPUAllocator
(
i
)),
platform
::
GpuMinChunkSize
(),
platform
::
GpuMaxChunkSize
());
VLOG
(
10
0
)
<<
"
\n\n
NOTE: each GPU device use "
<<
FLAGS_fraction_of_gpu_memory_to_use
*
100
<<
"% of GPU memory.
\n
"
<<
"You can set GFlags environment variable '"
<<
"FLAGS_fraction_of_gpu_memory_to_use"
<<
"' to change the fraction of GPU usage.
\n\n
"
;
VLOG
(
10
)
<<
"
\n\n
NOTE: each GPU device use "
<<
FLAGS_fraction_of_gpu_memory_to_use
*
100
<<
"% of GPU memory.
\n
"
<<
"You can set GFlags environment variable '"
<<
"FLAGS_fraction_of_gpu_memory_to_use"
<<
"' to change the fraction of GPU usage.
\n\n
"
;
}
});
...
...
paddle/fluid/memory/detail/buddy_allocator.cc
浏览文件 @
d3e140a5
...
...
@@ -32,11 +32,11 @@ BuddyAllocator::BuddyAllocator(
system_allocator_
(
std
::
move
(
system_allocator
))
{}
BuddyAllocator
::~
BuddyAllocator
()
{
VLOG
(
10
0
)
<<
"BuddyAllocator Disconstructor makes sure that all of these "
"have actually been freed"
;
VLOG
(
10
)
<<
"BuddyAllocator Disconstructor makes sure that all of these "
"have actually been freed"
;
while
(
!
pool_
.
empty
())
{
auto
block
=
static_cast
<
MemoryBlock
*>
(
std
::
get
<
2
>
(
*
pool_
.
begin
()));
VLOG
(
10
0
)
<<
"Free from block ("
<<
block
<<
", "
<<
max_chunk_size_
<<
")"
;
VLOG
(
10
)
<<
"Free from block ("
<<
block
<<
", "
<<
max_chunk_size_
<<
")"
;
system_allocator_
->
Free
(
block
,
max_chunk_size_
,
block
->
index
(
cache_
));
cache_
.
invalidate
(
block
);
...
...
@@ -57,12 +57,12 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) {
// acquire the allocator lock
std
::
lock_guard
<
std
::
mutex
>
lock
(
mutex_
);
VLOG
(
10
0
)
<<
"Allocate "
<<
unaligned_size
<<
" bytes from chunk size "
<<
size
;
VLOG
(
10
)
<<
"Allocate "
<<
unaligned_size
<<
" bytes from chunk size "
<<
size
;
// if the allocation is huge, send directly to the system allocator
if
(
size
>
max_chunk_size_
)
{
VLOG
(
10
0
)
<<
"Allocate from system allocator."
;
VLOG
(
10
)
<<
"Allocate from system allocator."
;
return
SystemAlloc
(
size
);
}
...
...
@@ -77,9 +77,9 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) {
return
nullptr
;
}
}
else
{
VLOG
(
10
0
)
<<
"Allocation from existing memory block "
<<
std
::
get
<
2
>
(
*
it
)
<<
" at address "
<<
reinterpret_cast
<
MemoryBlock
*>
(
std
::
get
<
2
>
(
*
it
))
->
data
();
VLOG
(
10
)
<<
"Allocation from existing memory block "
<<
std
::
get
<
2
>
(
*
it
)
<<
" at address "
<<
reinterpret_cast
<
MemoryBlock
*>
(
std
::
get
<
2
>
(
*
it
))
->
data
();
}
total_used_
+=
size
;
...
...
@@ -96,10 +96,10 @@ void BuddyAllocator::Free(void* p) {
// Acquire the allocator lock
std
::
lock_guard
<
std
::
mutex
>
lock
(
mutex_
);
VLOG
(
10
0
)
<<
"Free from address "
<<
block
;
VLOG
(
10
)
<<
"Free from address "
<<
block
;
if
(
block
->
type
(
cache_
)
==
MemoryBlock
::
HUGE_CHUNK
)
{
VLOG
(
10
0
)
<<
"Free directly from system allocator"
;
VLOG
(
10
)
<<
"Free directly from system allocator"
;
system_allocator_
->
Free
(
block
,
block
->
total_size
(
cache_
),
block
->
index
(
cache_
));
...
...
@@ -116,8 +116,8 @@ void BuddyAllocator::Free(void* p) {
// Trying to merge the right buddy
if
(
block
->
has_right_buddy
(
cache_
))
{
VLOG
(
10
0
)
<<
"Merging this block "
<<
block
<<
" with its right buddy "
<<
block
->
right_buddy
(
cache_
);
VLOG
(
10
)
<<
"Merging this block "
<<
block
<<
" with its right buddy "
<<
block
->
right_buddy
(
cache_
);
auto
right_buddy
=
block
->
right_buddy
(
cache_
);
...
...
@@ -134,8 +134,8 @@ void BuddyAllocator::Free(void* p) {
// Trying to merge the left buddy
if
(
block
->
has_left_buddy
(
cache_
))
{
VLOG
(
10
0
)
<<
"Merging this block "
<<
block
<<
" with its left buddy "
<<
block
->
left_buddy
(
cache_
);
VLOG
(
10
)
<<
"Merging this block "
<<
block
<<
" with its left buddy "
<<
block
->
left_buddy
(
cache_
);
auto
left_buddy
=
block
->
left_buddy
(
cache_
);
...
...
@@ -151,8 +151,8 @@ void BuddyAllocator::Free(void* p) {
}
// Dumping this block into pool
VLOG
(
10
0
)
<<
"Inserting free block ("
<<
block
<<
", "
<<
block
->
total_size
(
cache_
)
<<
")"
;
VLOG
(
10
)
<<
"Inserting free block ("
<<
block
<<
", "
<<
block
->
total_size
(
cache_
)
<<
")"
;
pool_
.
insert
(
IndexSizeAddress
(
block
->
index
(
cache_
),
block
->
total_size
(
cache_
),
block
));
...
...
@@ -174,7 +174,7 @@ void* BuddyAllocator::SystemAlloc(size_t size) {
size_t
index
=
0
;
void
*
p
=
system_allocator_
->
Alloc
(
&
index
,
size
);
VLOG
(
10
0
)
<<
"Allocated "
<<
p
<<
" from system allocator."
;
VLOG
(
10
)
<<
"Allocated "
<<
p
<<
" from system allocator."
;
if
(
p
==
nullptr
)
return
nullptr
;
...
...
@@ -200,8 +200,8 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
if
(
p
==
nullptr
)
return
pool_
.
end
();
VLOG
(
10
0
)
<<
"Creating and inserting new block "
<<
p
<<
" from system allocator"
;
VLOG
(
10
)
<<
"Creating and inserting new block "
<<
p
<<
" from system allocator"
;
static_cast
<
MemoryBlock
*>
(
p
)
->
init
(
&
cache_
,
MemoryBlock
::
FREE_CHUNK
,
index
,
max_chunk_size_
,
nullptr
,
nullptr
);
...
...
@@ -245,19 +245,19 @@ void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it,
auto
block
=
static_cast
<
MemoryBlock
*>
(
std
::
get
<
2
>
(
*
it
));
pool_
.
erase
(
it
);
VLOG
(
10
0
)
<<
"Split block ("
<<
block
<<
", "
<<
block
->
total_size
(
cache_
)
<<
") into"
;
VLOG
(
10
)
<<
"Split block ("
<<
block
<<
", "
<<
block
->
total_size
(
cache_
)
<<
") into"
;
block
->
split
(
&
cache_
,
size
);
VLOG
(
10
0
)
<<
"Left block ("
<<
block
<<
", "
<<
block
->
total_size
(
cache_
)
<<
")"
;
VLOG
(
10
)
<<
"Left block ("
<<
block
<<
", "
<<
block
->
total_size
(
cache_
)
<<
")"
;
block
->
set_type
(
&
cache_
,
MemoryBlock
::
ARENA_CHUNK
);
// the rest of memory if exist
if
(
block
->
has_right_buddy
(
cache_
))
{
if
(
block
->
right_buddy
(
cache_
)
->
type
(
cache_
)
==
MemoryBlock
::
FREE_CHUNK
)
{
VLOG
(
10
0
)
<<
"Insert right block ("
<<
block
->
right_buddy
(
cache_
)
<<
", "
<<
block
->
right_buddy
(
cache_
)
->
total_size
(
cache_
)
<<
")"
;
VLOG
(
10
)
<<
"Insert right block ("
<<
block
->
right_buddy
(
cache_
)
<<
", "
<<
block
->
right_buddy
(
cache_
)
->
total_size
(
cache_
)
<<
")"
;
pool_
.
insert
(
IndexSizeAddress
(
block
->
right_buddy
(
cache_
)
->
index
(
cache_
),
...
...
@@ -284,7 +284,7 @@ void BuddyAllocator::CleanIdleFallBackAlloc() {
return
;
}
VLOG
(
10
0
)
<<
"Return block "
<<
block
<<
" to fallback allocator."
;
VLOG
(
10
)
<<
"Return block "
<<
block
<<
" to fallback allocator."
;
system_allocator_
->
Free
(
block
,
max_chunk_size_
,
block
->
index
(
cache_
));
cache_
.
invalidate
(
block
);
...
...
@@ -320,7 +320,7 @@ void BuddyAllocator::CleanIdleNormalAlloc() {
MemoryBlock
*
block
=
static_cast
<
MemoryBlock
*>
(
std
::
get
<
2
>
(
*
pool
));
VLOG
(
10
0
)
<<
"Return block "
<<
block
<<
" to base allocator."
;
VLOG
(
10
)
<<
"Return block "
<<
block
<<
" to base allocator."
;
system_allocator_
->
Free
(
block
,
max_chunk_size_
,
block
->
index
(
cache_
));
cache_
.
invalidate
(
block
);
...
...
paddle/fluid/memory/detail/meta_cache.cc
浏览文件 @
d3e140a5
...
...
@@ -29,7 +29,7 @@ MemoryBlock::Desc MetadataCache::load(const MemoryBlock* block) const {
return
existing_desc
->
second
;
}
else
{
auto
*
desc
=
reinterpret_cast
<
const
MemoryBlock
::
Desc
*>
(
block
);
VLOG
(
10
0
)
<<
"Load MemoryBlock::Desc type="
<<
desc
->
type
;
VLOG
(
10
)
<<
"Load MemoryBlock::Desc type="
<<
desc
->
type
;
PADDLE_ASSERT
(
desc
->
check_guards
());
return
*
reinterpret_cast
<
const
MemoryBlock
::
Desc
*>
(
block
);
}
...
...
paddle/fluid/operators/activation_op.h
浏览文件 @
d3e140a5
...
...
@@ -100,7 +100,7 @@ class ActivationGradKernel
auto
x
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
X
);
functor
(
*
place
,
x
,
out
,
dout
,
dx
);
}
else
{
VLOG
(
10
0
)
<<
" Inplace activation "
;
VLOG
(
10
)
<<
" Inplace activation "
;
auto
x
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
dX
);
functor
(
*
place
,
x
,
out
,
dout
,
dx
);
}
...
...
paddle/fluid/operators/array_operator.h
浏览文件 @
d3e140a5
...
...
@@ -49,7 +49,7 @@ class ArrayOp : public framework::OperatorBase {
}
else
{
offset
=
static_cast
<
size_t
>
(
*
i_tensor
.
data
<
int64_t
>
());
}
VLOG
(
10
0
)
<<
" Offset = "
<<
offset
;
VLOG
(
10
)
<<
" Offset = "
<<
offset
;
return
offset
;
}
};
...
...
paddle/fluid/operators/array_to_lod_tensor_op.cc
浏览文件 @
d3e140a5
...
...
@@ -148,8 +148,8 @@ class ArrayToLoDTensorOp : public framework::OperatorBase {
size_t
start_offset
=
lod_and_offset
.
second
.
first
;
size_t
end_offset
=
lod_and_offset
.
second
.
second
;
VLOG
(
10
0
)
<<
"idx="
<<
idx
<<
" x_idx="
<<
x_idx
<<
" ["
<<
", "
<<
end_offset
<<
"]"
;
VLOG
(
10
)
<<
"idx="
<<
idx
<<
" x_idx="
<<
x_idx
<<
" ["
<<
", "
<<
end_offset
<<
"]"
;
// Copy data
PADDLE_ENFORCE_GE
(
end_offset
,
start_offset
);
size_t
len
=
end_offset
-
start_offset
;
...
...
paddle/fluid/operators/batch_norm_op.cu.cc
浏览文件 @
d3e140a5
...
...
@@ -96,7 +96,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
mode_
=
CUDNN_BATCHNORM_SPATIAL
;
#endif
VLOG
(
3
0
)
<<
"Setting descriptors."
;
VLOG
(
3
)
<<
"Setting descriptors."
;
std
::
vector
<
int
>
dims
;
std
::
vector
<
int
>
strides
;
if
(
data_layout
==
DataLayout
::
kNCHW
)
{
...
...
paddle/fluid/operators/beam_search_op.cc
浏览文件 @
d3e140a5
...
...
@@ -33,11 +33,11 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids,
auto
items
=
SelectTopBeamSizeItems
(
pre_ids
,
pre_scores
);
auto
selected_items
=
ToMap
(
items
,
high_level
.
back
());
VLOG
(
3
0
)
<<
"selected_items:"
;
VLOG
(
3
)
<<
"selected_items:"
;
for
(
size_t
i
=
0
;
i
<
selected_items
.
size
();
++
i
)
{
VLOG
(
3
0
)
<<
"offset:"
<<
i
;
VLOG
(
3
)
<<
"offset:"
<<
i
;
for
(
auto
&
item
:
selected_items
[
i
])
{
VLOG
(
3
0
)
<<
ItemToString
(
item
);
VLOG
(
3
)
<<
ItemToString
(
item
);
}
}
...
...
@@ -138,11 +138,11 @@ std::vector<std::vector<BeamSearch::Item>> BeamSearch::SelectTopBeamSizeItems(
}
result
.
emplace_back
(
items
);
}
VLOG
(
3
0
)
<<
"SelectTopBeamSizeItems result size "
<<
result
.
size
();
VLOG
(
3
)
<<
"SelectTopBeamSizeItems result size "
<<
result
.
size
();
for
(
auto
&
items
:
result
)
{
VLOG
(
3
0
)
<<
"item set:"
;
VLOG
(
3
)
<<
"item set:"
;
for
(
auto
&
item
:
items
)
{
VLOG
(
3
0
)
<<
ItemToString
(
item
);
VLOG
(
3
)
<<
ItemToString
(
item
);
}
}
...
...
paddle/fluid/operators/concat_op.cc
浏览文件 @
d3e140a5
...
...
@@ -37,7 +37,7 @@ class ConcatOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE_GT
(
n
,
0
,
"Input tensors count should > 0."
);
if
(
n
==
1
)
{
VLOG
(
3
0
)
<<
"Warning: concat op have only one input, may waste memory"
;
VLOG
(
3
)
<<
"Warning: concat op have only one input, may waste memory"
;
}
auto
out_dims
=
ins
[
0
];
...
...
paddle/fluid/operators/controlflow/feed_op.cc
浏览文件 @
d3e140a5
...
...
@@ -47,8 +47,8 @@ class FeedOp : public framework::OperatorBase {
auto
col
=
Attr
<
int
>
(
"col"
);
VLOG
(
3
0
)
<<
"Feed Var "
<<
feed_var_name
<<
"'s "
<<
col
<<
" column to var "
<<
out_name
;
VLOG
(
3
)
<<
"Feed Var "
<<
feed_var_name
<<
"'s "
<<
col
<<
" column to var "
<<
out_name
;
auto
&
feed_list
=
feed_var
->
Get
<
framework
::
FeedFetchList
>
();
auto
&
feed_item
=
feed_list
.
at
(
static_cast
<
size_t
>
(
col
));
...
...
paddle/fluid/operators/controlflow/fetch_op.cc
浏览文件 @
d3e140a5
...
...
@@ -57,7 +57,7 @@ class FetchOp : public framework::OperatorBase {
TensorCopySync
(
src_item
,
platform
::
CPUPlace
(),
&
dst_item
);
dst_item
.
set_lod
(
src_item
.
lod
());
VLOG
(
3
0
)
<<
"Fetch variable "
<<
fetch_var_name
<<
" to "
<<
out_name
;
VLOG
(
3
)
<<
"Fetch variable "
<<
fetch_var_name
<<
" to "
<<
out_name
;
}
};
...
...
paddle/fluid/operators/controlflow/parallel_do_op.cc
浏览文件 @
d3e140a5
...
...
@@ -48,7 +48,7 @@ static void SplitTensorAndMoveTensorToScopes(
auto
lod_tensors
=
tensor
.
SplitLoDTensor
(
places
);
for
(
auto
&
lod
:
lod_tensors
)
{
VLOG
(
3
0
)
<<
lod
.
dims
();
VLOG
(
3
)
<<
lod
.
dims
();
}
if
(
num_sub_scopes
==
0
)
{
num_sub_scopes
=
lod_tensors
.
size
();
...
...
@@ -263,7 +263,7 @@ class ParallelDoGradOp : public framework::OperatorBase {
if
(
s
==
framework
::
kEmptyVarName
)
{
continue
;
}
VLOG
(
3
0
)
<<
"Moving "
<<
s
;
VLOG
(
3
)
<<
"Moving "
<<
s
;
CopyOrShare
(
*
sub_scopes
[
0
]
->
FindVar
(
s
),
place
,
scope
.
FindVar
(
s
));
}
WaitOnPlaces
(
places
);
...
...
@@ -277,7 +277,7 @@ class ParallelDoGradOp : public framework::OperatorBase {
if
(
s
==
framework
::
kEmptyVarName
)
{
continue
;
}
VLOG
(
3
0
)
<<
"Accumulating "
<<
s
;
VLOG
(
3
)
<<
"Accumulating "
<<
s
;
if
(
s
==
framework
::
kEmptyVarName
)
continue
;
std
::
string
tmp_name
;
auto
*
tmp
=
sub_scopes
[
0
]
->
Var
(
&
tmp_name
);
...
...
@@ -289,7 +289,7 @@ class ParallelDoGradOp : public framework::OperatorBase {
auto
sum_op
=
framework
::
OpRegistry
::
CreateOp
(
"sum"
,
{{
"X"
,
{
s
,
tmp_name
}}},
{{
"Out"
,
{
s
}}},
framework
::
AttributeMap
{{
"use_mkldnn"
,
{
false
}}});
VLOG
(
10
0
)
<<
sum_op
->
DebugStringEx
(
sub_scopes
[
0
]);
VLOG
(
10
)
<<
sum_op
->
DebugStringEx
(
sub_scopes
[
0
]);
sum_op
->
Run
(
*
sub_scopes
[
0
],
places
[
0
]);
WaitOnPlace
(
places
[
0
]);
}
...
...
@@ -316,7 +316,7 @@ class ParallelDoGradOpDescMaker : public framework::SingleGradOpDescMaker {
auto
*
grad
=
new
framework
::
OpDesc
();
grad
->
SetType
(
"parallel_do_grad"
);
for
(
auto
&
input_param
:
this
->
InputNames
())
{
VLOG
(
3
0
)
<<
input_param
;
VLOG
(
3
)
<<
input_param
;
grad
->
SetInput
(
input_param
,
this
->
Input
(
input_param
));
if
(
input_param
!=
kPlaces
)
{
grad
->
SetOutput
(
framework
::
GradVarName
(
input_param
),
...
...
paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc
浏览文件 @
d3e140a5
...
...
@@ -34,8 +34,8 @@ class WriteToArrayOp : public ArrayOp {
auto
*
out
=
scope
.
FindVar
(
Output
(
"Out"
))
->
GetMutable
<
framework
::
LoDTensorArray
>
();
if
(
offset
>=
out
->
size
())
{
VLOG
(
10
0
)
<<
"Resize "
<<
Output
(
"Out"
)
<<
" from "
<<
out
->
size
()
<<
" to "
<<
offset
+
1
;
VLOG
(
10
)
<<
"Resize "
<<
Output
(
"Out"
)
<<
" from "
<<
out
->
size
()
<<
" to "
<<
offset
+
1
;
out
->
resize
(
offset
+
1
);
}
auto
*
out_tensor
=
&
out
->
at
(
offset
);
...
...
@@ -47,9 +47,9 @@ class WriteToArrayOp : public ArrayOp {
TensorCopy
(
x_tensor
,
place
,
dev_ctx
,
out_tensor
);
}
else
{
VLOG
(
10
0
)
<<
"WARNING: The input tensor 'x_tensor' holds no memory, so "
"nothing has been written to output array["
<<
offset
<<
"]."
;
VLOG
(
10
)
<<
"WARNING: The input tensor 'x_tensor' holds no memory, so "
"nothing has been written to output array["
<<
offset
<<
"]."
;
}
}
};
...
...
@@ -104,7 +104,7 @@ class WriteToArrayInferVarType : public framework::VarTypeInference {
framework
::
BlockDesc
*
block
)
const
override
{
auto
x_name
=
op_desc
.
Input
(
"X"
)[
0
];
auto
out_name
=
op_desc
.
Output
(
"Out"
)[
0
];
VLOG
(
10
0
)
<<
"Set Variable "
<<
out_name
<<
" as LOD_TENSOR_ARRAY"
;
VLOG
(
10
)
<<
"Set Variable "
<<
out_name
<<
" as LOD_TENSOR_ARRAY"
;
auto
&
out
=
block
->
FindRecursiveOrCreateVar
(
out_name
);
out
.
SetType
(
framework
::
proto
::
VarType
::
LOD_TENSOR_ARRAY
);
auto
*
x
=
block
->
FindVarRecursive
(
x_name
);
...
...
@@ -139,7 +139,7 @@ class ReadFromArrayOp : public ArrayOp {
framework
::
TensorCopy
(
x_array
[
offset
],
place
,
dev_ctx
,
out_tensor
);
out_tensor
->
set_lod
(
x_array
[
offset
].
lod
());
}
else
{
VLOG
(
10
0
)
<<
"offset "
<<
offset
<<
" >= "
<<
x_array
.
size
();
VLOG
(
10
)
<<
"offset "
<<
offset
<<
" >= "
<<
x_array
.
size
();
}
}
};
...
...
paddle/fluid/operators/controlflow/while_op.cc
浏览文件 @
d3e140a5
...
...
@@ -132,15 +132,15 @@ class WhileGradOp : public framework::OperatorBase {
for
(
auto
cur_scope_iter
=
step_scopes
->
rbegin
();
cur_scope_iter
!=
step_scopes
->
rend
();
++
cur_scope_iter
)
{
VLOG
(
3
0
)
<<
"Start backward at time_step "
<<
cur_scope_iter
-
step_scopes
->
rbegin
();
VLOG
(
3
)
<<
"Start backward at time_step "
<<
cur_scope_iter
-
step_scopes
->
rbegin
();
framework
::
Scope
&
cur_scope
=
**
cur_scope_iter
;
// Link OG from outside to inside
for
(
size_t
i
=
0
;
i
<
outside_og_names
.
size
();
++
i
)
{
auto
outside_og_name
=
outside_og_names
[
i
];
auto
inside_og_name
=
inside_og_names
[
i
];
VLOG
(
8
0
)
<<
"Linking outside "
<<
outside_og_name
<<
" --> inside "
<<
inside_og_name
;
VLOG
(
8
)
<<
"Linking outside "
<<
outside_og_name
<<
" --> inside "
<<
inside_og_name
;
if
(
scope
.
FindVar
(
outside_og_name
)
==
nullptr
)
{
continue
;
}
...
...
@@ -162,11 +162,11 @@ class WhileGradOp : public framework::OperatorBase {
auto
&
outside_array
=
og_outside
.
Get
<
framework
::
LoDTensorArray
>
();
auto
&
inside_array
=
detail
::
Ref
(
og_inside
.
GetMutable
<
framework
::
LoDTensorArray
>
());
VLOG
(
8
0
)
<<
outside_og_name
<<
" size = "
<<
outside_array
.
size
();
VLOG
(
8
)
<<
outside_og_name
<<
" size = "
<<
outside_array
.
size
();
inside_array
.
resize
(
outside_array
.
size
());
for
(
size_t
j
=
0
;
j
<
inside_array
.
size
();
++
j
)
{
VLOG
(
8
0
)
<<
j
<<
" "
<<
outside_array
[
j
].
numel
();
VLOG
(
8
)
<<
j
<<
" "
<<
outside_array
[
j
].
numel
();
if
(
outside_array
[
j
].
numel
()
!=
0
)
{
inside_array
[
j
].
set_lod
(
outside_array
[
j
].
lod
());
inside_array
[
j
].
ShareDataWith
(
outside_array
[
j
]);
...
...
@@ -292,7 +292,7 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker {
auto
igs
=
InputGrad
(
kX
,
/*do not drop empty gradient*/
false
);
for
(
auto
&
each_ig
:
igs
)
{
if
(
inner_op_outputs
.
find
(
each_ig
)
==
inner_op_outputs
.
end
())
{
VLOG
(
8
0
)
<<
"Ignore "
<<
each_ig
;
VLOG
(
8
)
<<
"Ignore "
<<
each_ig
;
each_ig
=
framework
::
kEmptyVarName
;
}
}
...
...
@@ -356,8 +356,8 @@ class WhileGradOpVarTypeInference : public framework::VarTypeInference {
auto
&
p_var
=
detail
::
Ref
(
block
->
FindVarRecursive
(
p_names
[
i
]));
auto
*
g_var
=
block
->
FindVarRecursive
(
pg_ig_names
[
i
]);
if
(
g_var
!=
nullptr
)
{
// Gradient could be @EMPTY@
VLOG
(
5
0
)
<<
"Setting "
<<
pg_ig_names
[
i
]
<<
" following "
<<
p_names
[
i
]
<<
" type: "
<<
p_var
.
GetType
();
VLOG
(
5
)
<<
"Setting "
<<
pg_ig_names
[
i
]
<<
" following "
<<
p_names
[
i
]
<<
" type: "
<<
p_var
.
GetType
();
g_var
->
SetType
(
p_var
.
GetType
());
g_var
->
SetDataType
(
p_var
.
GetDataType
());
}
...
...
paddle/fluid/operators/conv_cudnn_op.cu.cc
浏览文件 @
d3e140a5
...
...
@@ -151,11 +151,11 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
// Currently tensor core is only enabled using this algo
algo
=
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
;
half_float
=
true
;
VLOG
(
5
0
)
<<
"use cudnn_tensor_op_math"
;
VLOG
(
5
)
<<
"use cudnn_tensor_op_math"
;
}
else
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnSetConvolutionMathType
(
cudnn_conv_desc
,
CUDNN_DEFAULT_MATH
));
VLOG
(
5
0
)
<<
"NOT use cudnn_tensor_op_math"
;
VLOG
(
5
)
<<
"NOT use cudnn_tensor_op_math"
;
}
#endif
...
...
paddle/fluid/operators/distributed/brpc_server.cc
浏览文件 @
d3e140a5
...
...
@@ -133,10 +133,10 @@ void AsyncBRPCServer::StartServer() {
void
AsyncBRPCServer
::
ShutDownImpl
()
{
server_
.
Stop
(
1000
);
}
void
AsyncBRPCServer
::
WaitServerReady
()
{
VLOG
(
3
0
)
<<
"AsyncGRPCServer is wait server ready"
;
VLOG
(
3
)
<<
"AsyncGRPCServer is wait server ready"
;
std
::
unique_lock
<
std
::
mutex
>
lock
(
this
->
mutex_ready_
);
condition_ready_
.
wait
(
lock
,
[
=
]
{
return
this
->
ready_
==
1
;
});
VLOG
(
3
0
)
<<
"AsyncGRPCServer WaitSeverReady"
;
VLOG
(
3
)
<<
"AsyncGRPCServer WaitSeverReady"
;
}
};
// namespace distributed
...
...
paddle/fluid/operators/distributed/grpc_client.cc
浏览文件 @
d3e140a5
...
...
@@ -40,7 +40,7 @@ void GRPCClient::SendComplete() {
std
::
unique_lock
<
std
::
mutex
>
lk
(
completed_mutex_
);
if
(
!
completed_
)
{
for
(
auto
&
it
:
channels_
)
{
VLOG
(
3
0
)
<<
"send complete message to "
<<
it
.
first
;
VLOG
(
3
)
<<
"send complete message to "
<<
it
.
first
;
this
->
AsyncSendComplete
(
it
.
first
);
}
PADDLE_ENFORCE
(
this
->
Wait
(),
"internal grpc error"
);
...
...
@@ -83,7 +83,7 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep,
::
grpc
::
ByteBuffer
req
;
SerializeToByteBuffer
(
var_name_val
,
var
,
*
p_ctx
,
&
req
,
""
,
trainer_id_
);
VLOG
(
3
0
)
<<
s
->
GetVarHandlePtr
()
->
String
()
<<
" begin"
;
VLOG
(
3
)
<<
s
->
GetVarHandlePtr
()
->
String
()
<<
" begin"
;
// stub context
s
->
response_call_back_
=
nullptr
;
...
...
@@ -144,7 +144,7 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep,
::
grpc
::
ByteBuffer
buf
;
RequestToByteBuffer
<
sendrecv
::
VariableMessage
>
(
req
,
&
buf
);
VLOG
(
3
0
)
<<
s
->
GetVarHandlePtr
()
->
String
()
<<
" begin"
;
VLOG
(
3
)
<<
s
->
GetVarHandlePtr
()
->
String
()
<<
" begin"
;
// stub context
s
->
response_call_back_
=
ProcGetResponse
;
...
...
@@ -192,7 +192,7 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep,
::
grpc
::
ByteBuffer
req
;
SerializeToByteBuffer
(
in_var_name_val
,
var
,
*
p_ctx
,
&
req
,
out_var_name_val
);
VLOG
(
3
0
)
<<
s
->
GetVarHandlePtr
()
->
String
()
<<
" begin"
;
VLOG
(
3
)
<<
s
->
GetVarHandlePtr
()
->
String
()
<<
" begin"
;
// stub context
s
->
response_call_back_
=
ProcGetResponse
;
...
...
@@ -330,14 +330,14 @@ void GRPCClient::Proceed() {
void
*
tag
=
nullptr
;
bool
ok
=
false
;
VLOG
(
3
0
)
<<
"GRPCClient Proceed begin"
;
VLOG
(
3
)
<<
"GRPCClient Proceed begin"
;
while
(
!
stopped_
&&
cq_
.
Next
(
&
tag
,
&
ok
))
{
BaseProcessor
*
c
=
static_cast
<
BaseProcessor
*>
(
tag
);
GPR_ASSERT
(
ok
);
PADDLE_ENFORCE
(
c
);
if
(
c
->
status_
.
ok
())
{
VLOG
(
3
0
)
<<
c
->
GetVarHandlePtr
()
->
String
()
<<
" process"
;
VLOG
(
3
)
<<
c
->
GetVarHandlePtr
()
->
String
()
<<
" process"
;
c
->
Process
();
}
else
if
(
c
->
status_
.
error_code
()
==
grpc
::
StatusCode
::
DEADLINE_EXCEEDED
)
{
// FIXME(gongwb): parse error_details?
...
...
@@ -372,7 +372,7 @@ void GRPCClient::Proceed() {
sync_cond_
.
notify_all
();
}
}
VLOG
(
3
0
)
<<
"GRPCClient Proceed end"
;
VLOG
(
3
)
<<
"GRPCClient Proceed end"
;
}
std
::
shared_ptr
<
grpc
::
Channel
>
GRPCClient
::
GetChannel
(
const
std
::
string
&
ep
)
{
...
...
paddle/fluid/operators/distributed/grpc_server.cc
浏览文件 @
d3e140a5
...
...
@@ -100,7 +100,7 @@ class RequestSend final : public RequestBase {
void
Process
()
override
{
std
::
string
varname
=
GetReqName
();
VLOG
(
4
0
)
<<
"RequestSend var_name:"
<<
varname
;
VLOG
(
4
)
<<
"RequestSend var_name:"
<<
varname
;
auto
scope
=
request_
->
GetMutableLocalScope
();
auto
invar
=
request_
->
GetVar
();
...
...
@@ -137,7 +137,7 @@ class RequestGet final : public RequestBase {
// proc request.
std
::
string
varname
=
request_
.
varname
();
int
trainer_id
=
request_
.
trainer_id
();
VLOG
(
4
0
)
<<
"RequestGet "
<<
varname
;
VLOG
(
4
)
<<
"RequestGet "
<<
varname
;
auto
scope
=
request_handler_
->
scope
();
auto
invar
=
scope
->
FindVar
(
varname
);
...
...
@@ -184,8 +184,8 @@ class RequestPrefetch final : public RequestBase {
std
::
string
in_var_name
=
request_
->
Varname
();
std
::
string
out_var_name
=
request_
->
OutVarname
();
int
trainer_id
=
request_
->
GetTrainerId
();
VLOG
(
4
0
)
<<
"RequestPrefetch, in_var_name: "
<<
in_var_name
<<
" out_var_name: "
<<
out_var_name
;
VLOG
(
4
)
<<
"RequestPrefetch, in_var_name: "
<<
in_var_name
<<
" out_var_name: "
<<
out_var_name
;
auto
scope
=
request_
->
GetMutableLocalScope
();
auto
invar
=
scope
->
FindVar
(
in_var_name
);
...
...
@@ -233,8 +233,8 @@ class RequestCheckpointNotify final : public RequestBase {
std
::
string
checkpoint_dir
=
request_
->
OutVarname
();
int
trainer_id
=
request_
->
GetTrainerId
();
VLOG
(
4
0
)
<<
"RequestCheckpointNotify notify: "
<<
checkpoint_notify
<<
", dir: "
<<
checkpoint_dir
;
VLOG
(
4
)
<<
"RequestCheckpointNotify notify: "
<<
checkpoint_notify
<<
", dir: "
<<
checkpoint_dir
;
request_handler_
->
Handle
(
checkpoint_notify
,
scope
,
nullptr
,
nullptr
,
trainer_id
,
checkpoint_dir
);
...
...
@@ -248,10 +248,10 @@ class RequestCheckpointNotify final : public RequestBase {
};
void
AsyncGRPCServer
::
WaitServerReady
()
{
VLOG
(
4
0
)
<<
"AsyncGRPCServer is wait server ready"
;
VLOG
(
4
)
<<
"AsyncGRPCServer is wait server ready"
;
std
::
unique_lock
<
std
::
mutex
>
lock
(
this
->
mutex_ready_
);
condition_ready_
.
wait
(
lock
,
[
=
]
{
return
this
->
ready_
==
1
;
});
VLOG
(
4
0
)
<<
"AsyncGRPCServer WaitSeverReady"
;
VLOG
(
4
)
<<
"AsyncGRPCServer WaitSeverReady"
;
}
// Define an option subclass in order to disable SO_REUSEPORT for the
...
...
@@ -302,15 +302,14 @@ void AsyncGRPCServer::StartServer() {
reqs
.
reserve
(
kRequestBufSize
);
for
(
int
i
=
0
;
i
<
kRequestBufSize
;
i
++
)
{
VLOG
(
60
)
<<
"TryToRegisterNewOne on RPC NAME: "
<<
rpc_name
<<
" I: "
<<
i
;
VLOG
(
6
)
<<
"TryToRegisterNewOne on RPC NAME: "
<<
rpc_name
<<
" I: "
<<
i
;
TryToRegisterNewOne
(
rpc_name
,
i
);
}
for
(
int
i
=
0
;
i
<
threadnum
;
i
++
)
{
rpc_threads_
[
rpc_name
].
emplace_back
(
new
std
::
thread
(
std
::
bind
(
&
AsyncGRPCServer
::
HandleRequest
,
this
,
cq
.
get
(),
rpc_name
,
f
)));
VLOG
(
4
0
)
<<
t
.
first
<<
" creates threads!"
;
VLOG
(
4
)
<<
t
.
first
<<
" creates threads!"
;
}
}
...
...
@@ -327,7 +326,7 @@ void AsyncGRPCServer::StartServer() {
auto
&
threads
=
t
.
second
;
for
(
size_t
i
=
0
;
i
<
threads
.
size
();
++
i
)
{
threads
[
i
]
->
join
();
VLOG
(
4
0
)
<<
t
.
first
<<
" threads ends!"
;
VLOG
(
4
)
<<
t
.
first
<<
" threads ends!"
;
}
}
}
...
...
@@ -335,7 +334,7 @@ void AsyncGRPCServer::StartServer() {
void
AsyncGRPCServer
::
ShutdownQueue
()
{
for
(
auto
&
t
:
rpc_cq_
)
{
t
.
second
->
Shutdown
();
VLOG
(
4
0
)
<<
t
.
first
<<
" queue shutdown!"
;
VLOG
(
4
)
<<
t
.
first
<<
" queue shutdown!"
;
}
}
...
...
@@ -344,7 +343,7 @@ void AsyncGRPCServer::ShutDownImpl() {
is_shut_down_
=
true
;
ShutdownQueue
();
VLOG
(
4
0
)
<<
"server_ shutdown!"
;
VLOG
(
4
)
<<
"server_ shutdown!"
;
server_
->
Shutdown
();
}
...
...
@@ -352,12 +351,12 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name,
int
req_id
)
{
std
::
unique_lock
<
std
::
mutex
>
lock
(
cq_mutex_
);
if
(
is_shut_down_
)
{
VLOG
(
4
0
)
<<
"shutdown, do not TryToRegisterNewSendOne"
;
VLOG
(
4
)
<<
"shutdown, do not TryToRegisterNewSendOne"
;
return
;
}
VLOG
(
4
0
)
<<
"TryToRegisterNewOne on RPC NAME: "
<<
rpc_name
<<
" REQ ID: "
<<
req_id
;
VLOG
(
4
)
<<
"TryToRegisterNewOne on RPC NAME: "
<<
rpc_name
<<
" REQ ID: "
<<
req_id
;
auto
&
reqs
=
rpc_reqs_
[
rpc_name
];
auto
&
handler
=
rpc_call_map_
[
rpc_name
];
...
...
@@ -378,7 +377,7 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name,
reqs
[
req_id
]
=
b
;
VLOG
(
4
0
)
<<
"Create RequestSend status:"
<<
b
->
Status
();
VLOG
(
4
)
<<
"Create RequestSend status:"
<<
b
->
Status
();
}
void
AsyncGRPCServer
::
HandleRequest
(
...
...
@@ -388,15 +387,15 @@ void AsyncGRPCServer::HandleRequest(
bool
ok
=
false
;
while
(
true
)
{
VLOG
(
4
0
)
<<
"HandleRequest "
<<
rpc_name
<<
" wait next"
;
VLOG
(
4
)
<<
"HandleRequest "
<<
rpc_name
<<
" wait next"
;
if
(
!
cq
->
Next
(
&
tag
,
&
ok
))
{
VLOG
(
3
0
)
<<
"CompletionQueue "
<<
rpc_name
<<
" shutdown!"
;
VLOG
(
3
)
<<
"CompletionQueue "
<<
rpc_name
<<
" shutdown!"
;
break
;
}
int
req_id
=
static_cast
<
int
>
(
reinterpret_cast
<
intptr_t
>
(
tag
));
VLOG
(
4
0
)
<<
"HandleRequest "
<<
rpc_name
<<
", req_id:"
<<
req_id
<<
" get next"
;
VLOG
(
4
)
<<
"HandleRequest "
<<
rpc_name
<<
", req_id:"
<<
req_id
<<
" get next"
;
auto
&
reqs
=
rpc_reqs_
[
rpc_name
];
RequestBase
*
base
=
nullptr
;
...
...
@@ -406,7 +405,7 @@ void AsyncGRPCServer::HandleRequest(
base
=
reqs
[
req_id
];
}
VLOG
(
3
0
)
<<
base
->
Status2String
(
rpc_name
);
VLOG
(
3
)
<<
base
->
Status2String
(
rpc_name
);
// reference:
// https://github.com/tensorflow/tensorflow/issues/5596
...
...
paddle/fluid/operators/distributed/request_handler.h
浏览文件 @
d3e140a5
...
...
@@ -75,7 +75,7 @@ class VarHandle {
wait_cond_
.
wait
(
lk
,
[
this
]
{
return
status_
!=
kDefaultState
;
});
ret
=
status_
;
}
VLOG
(
7
0
)
<<
"VarHandle wait:"
<<
ret
;
VLOG
(
7
)
<<
"VarHandle wait:"
<<
ret
;
return
ret
!=
kErrorState
;
}
...
...
@@ -84,7 +84,7 @@ class VarHandle {
std
::
unique_lock
<
std
::
mutex
>
lk
(
sync_mutex_
);
status_
=
ok
?
kFinishState
:
kErrorState
;
}
VLOG
(
7
0
)
<<
"VarHandle finish:"
<<
ok
;
VLOG
(
7
)
<<
"VarHandle finish:"
<<
ok
;
wait_cond_
.
notify_all
();
}
...
...
paddle/fluid/operators/distributed/request_handler_impl.cc
浏览文件 @
d3e140a5
...
...
@@ -38,19 +38,19 @@ bool RequestSendHandler::Handle(const std::string& varname,
framework
::
Variable
**
outvar
,
const
int
trainer_id
,
const
std
::
string
&
out_var_name
)
{
VLOG
(
4
0
)
<<
"RequestSendHandler:"
<<
varname
;
VLOG
(
4
)
<<
"RequestSendHandler:"
<<
varname
;
// Sync
if
(
varname
==
BATCH_BARRIER_MESSAGE
)
{
VLOG
(
3
0
)
<<
"sync: recv BATCH_BARRIER_MESSAGE"
;
VLOG
(
3
)
<<
"sync: recv BATCH_BARRIER_MESSAGE"
;
rpc_server_
->
IncreaseBatchBarrier
(
kRequestSend
);
}
else
if
(
varname
==
COMPLETE_MESSAGE
)
{
VLOG
(
3
0
)
<<
"sync: recv complete message"
;
VLOG
(
3
)
<<
"sync: recv complete message"
;
rpc_server_
->
Complete
();
}
else
{
// Async
if
(
!
sync_mode_
)
{
VLOG
(
3
0
)
<<
"async process var: "
<<
varname
;
VLOG
(
3
)
<<
"async process var: "
<<
varname
;
try
{
executor_
->
RunPreparedContext
((
*
grad_to_prepared_ctx_
)[
varname
].
get
(),
scope
);
...
...
@@ -61,7 +61,7 @@ bool RequestSendHandler::Handle(const std::string& varname,
return
true
;
}
else
{
// sync
rpc_server_
->
WaitCond
(
kRequestSend
);
VLOG
(
3
0
)
<<
"sync: processing received var: "
<<
varname
;
VLOG
(
3
)
<<
"sync: processing received var: "
<<
varname
;
if
(
invar
==
nullptr
)
{
LOG
(
FATAL
)
<<
"sync: Can not find server side var: "
<<
varname
;
...
...
@@ -78,10 +78,10 @@ bool RequestGetHandler::Handle(const std::string& varname,
framework
::
Variable
**
outvar
,
const
int
trainer_id
,
const
std
::
string
&
out_var_name
)
{
VLOG
(
4
0
)
<<
"RequestGetHandler:"
<<
varname
;
VLOG
(
4
)
<<
"RequestGetHandler:"
<<
varname
;
if
(
sync_mode_
)
{
if
(
varname
==
FETCH_BARRIER_MESSAGE
)
{
VLOG
(
3
0
)
<<
"sync: recv fetch barrier message"
;
VLOG
(
3
)
<<
"sync: recv fetch barrier message"
;
rpc_server_
->
IncreaseBatchBarrier
(
kRequestGet
);
}
else
{
rpc_server_
->
WaitCond
(
kRequestGet
);
...
...
@@ -93,14 +93,13 @@ bool RequestGetHandler::Handle(const std::string& varname,
// NOTE: the format is determined by distributed_transpiler.py
std
::
string
param_bak_name
=
string
::
Sprintf
(
"%s.trainer_%d_bak"
,
varname
,
trainer_id
);
VLOG
(
30
)
<<
"getting "
<<
param_bak_name
<<
" trainer_id "
<<
trainer_id
;
VLOG
(
3
)
<<
"getting "
<<
param_bak_name
<<
" trainer_id "
<<
trainer_id
;
auto
var
=
scope_
->
FindVar
(
varname
);
auto
t_orig
=
var
->
Get
<
framework
::
LoDTensor
>
();
auto
param_bak
=
scope_
->
Var
(
param_bak_name
);
auto
t
=
param_bak
->
GetMutable
<
framework
::
LoDTensor
>
();
t
->
mutable_data
(
dev_ctx_
->
GetPlace
(),
t_orig
.
type
());
VLOG
(
3
0
)
<<
"copying "
<<
varname
<<
" to "
<<
param_bak_name
;
VLOG
(
3
)
<<
"copying "
<<
varname
<<
" to "
<<
param_bak_name
;
framework
::
TensorCopy
(
t_orig
,
dev_ctx_
->
GetPlace
(),
t
);
}
*
outvar
=
scope_
->
FindVar
(
varname
);
...
...
@@ -115,7 +114,7 @@ bool RequestPrefetchHandler::Handle(const std::string& varname,
framework
::
Variable
**
outvar
,
const
int
trainer_id
,
const
std
::
string
&
out_var_name
)
{
VLOG
(
4
0
)
<<
"RequestPrefetchHandler "
<<
varname
;
VLOG
(
4
)
<<
"RequestPrefetchHandler "
<<
varname
;
auto
var_desc
=
program_
->
Block
(
0
).
FindVar
(
out_var_name
);
InitializeVariable
(
*
outvar
,
var_desc
->
GetType
());
...
...
@@ -139,8 +138,8 @@ bool RequestCheckpointHandler::Handle(const std::string& varname,
auto
*
lt_var
=
scope_
->
FindVar
(
LOOKUP_TABLE_PATH
)
->
GetMutable
<
std
::
string
>
();
lt_var
->
clear
();
lt_var
->
append
(
out_var_name
);
VLOG
(
4
0
)
<<
"RequestCheckpointHandler update var kLookupTablePath to: "
<<
out_var_name
;
VLOG
(
4
)
<<
"RequestCheckpointHandler update var kLookupTablePath to: "
<<
out_var_name
;
executor_
->
RunPreparedContext
(
checkpoint_prepared_ctx_
.
get
(),
scope_
);
return
true
;
}
...
...
paddle/fluid/operators/distributed/rpc_server.cc
浏览文件 @
d3e140a5
...
...
@@ -39,7 +39,7 @@ void RPCServer::SavePort() const {
port_file
.
open
(
file_path
);
port_file
<<
selected_port_
;
port_file
.
close
();
VLOG
(
4
0
)
<<
"selected port written to "
<<
file_path
;
VLOG
(
4
)
<<
"selected port written to "
<<
file_path
;
}
void
RPCServer
::
WaitBarrier
(
const
std
::
string
&
rpc_name
)
{
...
...
@@ -49,12 +49,12 @@ void RPCServer::WaitBarrier(const std::string& rpc_name) {
exit_flag_
.
load
());
});
VLOG
(
3
0
)
<<
"batch_barrier_: "
<<
rpc_name
<<
" "
<<
barrier_counter_
[
rpc_name
];
VLOG
(
3
)
<<
"batch_barrier_: "
<<
rpc_name
<<
" "
<<
barrier_counter_
[
rpc_name
];
}
void
RPCServer
::
IncreaseBatchBarrier
(
const
std
::
string
rpc_name
)
{
VLOG
(
4
0
)
<<
"RPCServer begin IncreaseBatchBarrier "
<<
rpc_name
;
VLOG
(
4
)
<<
"RPCServer begin IncreaseBatchBarrier "
<<
rpc_name
;
int
b
=
0
;
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
b
=
++
barrier_counter_
[
rpc_name
];
...
...
@@ -71,7 +71,7 @@ void RPCServer::Complete() {
client_num_
--
;
need_reset_all_vars_
=
true
;
VLOG
(
4
0
)
<<
"decrease client_num to: "
<<
client_num_
;
VLOG
(
4
)
<<
"decrease client_num to: "
<<
client_num_
;
if
(
cur_cond_
.
load
()
==
rpc_cond_map_
[
kRequestGet
])
{
barrier_counter_
[
kRequestGet
]
--
;
}
...
...
@@ -90,7 +90,7 @@ int RPCServer::GetClientNum() {
}
void
RPCServer
::
ResetBarrierCounter
()
{
VLOG
(
3
0
)
<<
"RPCServer ResetBarrierCounter "
;
VLOG
(
3
)
<<
"RPCServer ResetBarrierCounter "
;
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
for
(
auto
&
t
:
barrier_counter_
)
{
t
.
second
=
0
;
...
...
@@ -105,12 +105,12 @@ void RPCServer::RegisterRPC(const std::string& rpc_name,
static
int
cond
=
-
1
;
rpc_cond_map_
[
rpc_name
]
=
++
cond
;
VLOG
(
4
0
)
<<
"RegisterRPC rpc_name:"
<<
rpc_name
<<
", handler:"
<<
handler
<<
", cond:"
<<
rpc_cond_map_
[
rpc_name
];
VLOG
(
4
)
<<
"RegisterRPC rpc_name:"
<<
rpc_name
<<
", handler:"
<<
handler
<<
", cond:"
<<
rpc_cond_map_
[
rpc_name
];
}
void
RPCServer
::
SetCond
(
const
std
::
string
&
rpc_name
)
{
VLOG
(
3
0
)
<<
"RPCServer SetCond "
<<
rpc_name
;
VLOG
(
3
)
<<
"RPCServer SetCond "
<<
rpc_name
;
{
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
cur_cond_
=
rpc_cond_map_
[
rpc_name
];
...
...
@@ -120,7 +120,7 @@ void RPCServer::SetCond(const std::string& rpc_name) {
}
void
RPCServer
::
WaitCond
(
const
std
::
string
&
rpc_name
)
{
VLOG
(
4
0
)
<<
"RPCServer WaitCond "
<<
rpc_name
;
VLOG
(
4
)
<<
"RPCServer WaitCond "
<<
rpc_name
;
int
cond
=
0
;
{
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
...
...
paddle/fluid/operators/distributed/variable_response.cc
浏览文件 @
d3e140a5
...
...
@@ -50,7 +50,7 @@ bool VariableResponse::ReadRaw(::google::protobuf::io::CodedInputStream* input,
size_to_write
=
length
-
total_written
;
}
// This log is useful to see how long a internal block size is of rpc.
VLOG
(
7
0
)
<<
"copy "
<<
size_to_write
<<
" data to CUDAPlace"
;
VLOG
(
7
)
<<
"copy "
<<
size_to_write
<<
" data to CUDAPlace"
;
memory
::
Copy
(
boost
::
get
<
platform
::
CUDAPlace
>
(
place
),
reinterpret_cast
<
void
*>
(
p
),
cpu
,
data
,
size_to_write
,
gpu_dev_ctx
.
stream
());
...
...
@@ -79,7 +79,7 @@ bool VariableResponse::ReadRaw(::google::protobuf::io::CodedInputStream* input,
// TODO(gongwb): can we avoid copy?
platform
::
CPUPlace
cpu
;
// This log is useful to see how long a internal block size is of rpc.
VLOG
(
7
0
)
<<
"copy "
<<
size_to_write
<<
" data to CPUPlace"
;
VLOG
(
7
)
<<
"copy "
<<
size_to_write
<<
" data to CPUPlace"
;
memory
::
Copy
(
cpu
,
reinterpret_cast
<
void
*>
(
p
),
cpu
,
data
,
size_to_write
);
p
+=
size_to_write
;
...
...
@@ -198,8 +198,8 @@ bool VariableResponse::ProcSerializedField(
#endif
}
VLOG
(
7
0
)
<<
"ProcSerializedField:"
<<
meta_
.
varname
()
<<
", type:"
<<
meta_
.
type
()
<<
std
::
endl
;
VLOG
(
7
)
<<
"ProcSerializedField:"
<<
meta_
.
varname
()
<<
", type:"
<<
meta_
.
type
()
<<
std
::
endl
;
framework
::
DDim
dims
=
GetDims
(
meta_
.
dims
());
if
(
meta_
.
type
()
==
sendrecv
::
LOD_TENSOR
)
{
PADDLE_ENFORCE
(
meta_
.
lod_size
()
>=
0
,
"lod info should be got first!"
);
...
...
paddle/fluid/operators/distributed_ops/checkpoint_notify_op.cc
浏览文件 @
d3e140a5
...
...
@@ -46,8 +46,8 @@ class CheckpointNotifyOp : public framework::OperatorBase {
auto
lookup_table_save_dir
=
string
::
Sprintf
(
"%s/%s_%d"
,
dir
,
lookup_table_name
,
i
);
rpc_client
->
AsyncCheckpointNotify
(
epmap
[
i
],
lookup_table_save_dir
);
VLOG
(
3
0
)
<<
"checkpoint notify sending lookup table: "
<<
lookup_table_name
<<
" and dir:"
<<
dir
<<
" to "
<<
epmap
[
i
];
VLOG
(
3
)
<<
"checkpoint notify sending lookup table: "
<<
lookup_table_name
<<
" and dir:"
<<
dir
<<
" to "
<<
epmap
[
i
];
}
PADDLE_ENFORCE
(
rpc_client
->
Wait
(),
"internal error in RPCClient"
);
}
...
...
paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc
浏览文件 @
d3e140a5
...
...
@@ -43,7 +43,7 @@ class FetchBarrierOp : public framework::OperatorBase {
PADDLE_ENFORCE
(
rpc_client
->
Wait
(),
"internal error in RPCClient"
);
for
(
auto
&
ep
:
eps
)
{
VLOG
(
3
0
)
<<
"fetch barrier, ep: "
<<
ep
;
VLOG
(
3
)
<<
"fetch barrier, ep: "
<<
ep
;
rpc_client
->
AsyncSendFetchBarrier
(
ep
);
}
PADDLE_ENFORCE
(
rpc_client
->
Wait
(),
"internal error in RPCClient"
);
...
...
paddle/fluid/operators/distributed_ops/gen_nccl_id_op.cc
浏览文件 @
d3e140a5
...
...
@@ -64,7 +64,7 @@ class GenNCCLIdOp : public framework::OperatorBase {
distributed
::
RPCClient
::
GetInstance
<
RPCCLIENT_T
>
(
0
);
for
(
auto
&
ep
:
endpoint_list
)
{
VLOG
(
3
0
)
<<
"sending nccl id to "
<<
ep
;
VLOG
(
3
)
<<
"sending nccl id to "
<<
ep
;
client
->
AsyncSendVar
(
ep
,
dev_ctx
,
*
scope
,
NCCL_ID_VARNAME
);
}
client
->
Wait
();
...
...
@@ -72,7 +72,7 @@ class GenNCCLIdOp : public framework::OperatorBase {
client
->
AsyncSendBatchBarrier
(
ep
);
}
client
->
Wait
();
VLOG
(
3
0
)
<<
"sending completed..."
;
VLOG
(
3
)
<<
"sending completed..."
;
}
void
GetIdByServer
(
framework
::
Scope
*
scope
,
...
...
@@ -99,11 +99,11 @@ class GenNCCLIdOp : public framework::OperatorBase {
std
::
bind
(
&
distributed
::
RPCServer
::
StartServer
,
rpc_service
.
get
()));
rpc_service
->
SetCond
(
distributed
::
kRequestSend
);
VLOG
(
3
0
)
<<
"start getting nccl id from trainer 0..."
;
VLOG
(
3
)
<<
"start getting nccl id from trainer 0..."
;
rpc_service
->
WaitBarrier
(
distributed
::
kRequestSend
);
VLOG
(
3
0
)
<<
"got nccl id and stop server..."
;
VLOG
(
3
)
<<
"got nccl id and stop server..."
;
rpc_service
->
ShutDown
();
VLOG
(
3
0
)
<<
"rpc server stopped"
;
VLOG
(
3
)
<<
"rpc server stopped"
;
server_thread
.
join
();
}
};
...
...
paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc
浏览文件 @
d3e140a5
...
...
@@ -36,7 +36,7 @@ namespace operators {
void
RunServer
(
std
::
shared_ptr
<
distributed
::
RPCServer
>
service
)
{
service
->
StartServer
();
VLOG
(
4
0
)
<<
"RunServer thread end"
;
VLOG
(
4
)
<<
"RunServer thread end"
;
}
static
void
split
(
const
std
::
string
&
str
,
char
sep
,
std
::
vector
<
std
::
string
>
*
pieces
)
{
...
...
@@ -66,8 +66,8 @@ static void ParallelExecuteBlocks(
fs
.
push_back
(
framework
::
Async
([
&
executor
,
&
prepared
,
&
scope
,
idx
]()
{
int
run_block
=
idx
;
// thread local
try
{
VLOG
(
3
0
)
<<
"running server block: "
<<
run_block
<<
"pointer: "
<<
prepared
[
run_block
].
get
();
VLOG
(
3
)
<<
"running server block: "
<<
run_block
<<
"pointer: "
<<
prepared
[
run_block
].
get
();
executor
->
RunPreparedContext
(
prepared
[
run_block
].
get
(),
scope
);
}
catch
(
const
std
::
exception
&
e
)
{
LOG
(
FATAL
)
<<
"run sub program:"
<<
idx
<<
" error "
<<
e
.
what
();
...
...
@@ -108,7 +108,7 @@ void ListenAndServOp::RunSyncLoop(
framework
::
Scope
*
recv_scope
,
platform
::
DeviceContext
*
dev_ctx
,
const
std
::
vector
<
int
>
&
prefetch_block_id_list
,
const
int
checkpoint_point_block_id
)
const
{
VLOG
(
2
0
)
<<
"RunSyncLoop"
;
VLOG
(
2
)
<<
"RunSyncLoop"
;
size_t
num_blocks
=
program
->
Size
();
auto
optimize_blocks
=
Attr
<
std
::
vector
<
framework
::
BlockDesc
*>>
(
kOptimizeBlocks
);
...
...
@@ -167,7 +167,7 @@ void ListenAndServOp::RunSyncLoop(
}
ParallelExecuteBlocks
(
parallel_blkids
,
executor
,
optimize_prepared
,
program
,
recv_scope
);
VLOG
(
2
0
)
<<
"run all blocks spent "
<<
GetTimestamp
()
-
ts
<<
"(ms)"
;
VLOG
(
2
)
<<
"run all blocks spent "
<<
GetTimestamp
()
-
ts
<<
"(ms)"
;
ResetReceivedVars
(
recv_scope
,
dev_ctx
,
rpc_service_
->
NeedResetAllVars
());
...
...
@@ -183,11 +183,11 @@ void ListenAndServOp::ResetReceivedVars(framework::Scope *recv_scope,
for
(
auto
&
varname
:
sparse_vars_
)
{
auto
var
=
recv_scope
->
FindVar
(
varname
);
if
(
var
==
nullptr
)
{
VLOG
(
2
0
)
<<
"can not find var "
<<
varname
<<
" in received scope"
;
VLOG
(
2
)
<<
"can not find var "
<<
varname
<<
" in received scope"
;
continue
;
}
if
(
var
->
IsType
<
framework
::
SelectedRows
>
())
{
VLOG
(
3
0
)
<<
"reset sparse var: "
<<
varname
;
VLOG
(
3
)
<<
"reset sparse var: "
<<
varname
;
var
->
GetMutable
<
framework
::
SelectedRows
>
()
->
mutable_rows
()
->
clear
();
}
else
{
PADDLE_THROW
(
"The type of sparse var should be SelectedRows"
);
...
...
@@ -197,7 +197,7 @@ void ListenAndServOp::ResetReceivedVars(framework::Scope *recv_scope,
for
(
auto
&
varname
:
dense_vars_
)
{
auto
var
=
recv_scope
->
FindVar
(
varname
);
if
(
var
==
nullptr
)
{
VLOG
(
2
0
)
<<
"can not find var "
<<
varname
<<
" in received scope"
;
VLOG
(
2
)
<<
"can not find var "
<<
varname
<<
" in received scope"
;
continue
;
}
if
(
var
->
IsType
<
framework
::
LoDTensor
>
())
{
...
...
@@ -216,7 +216,7 @@ void ListenAndServOp::ResetReceivedVars(framework::Scope *recv_scope,
void
ListenAndServOp
::
RunAsyncLoop
(
framework
::
Executor
*
executor
,
framework
::
ProgramDesc
*
program
,
framework
::
Scope
*
recv_scope
)
const
{
VLOG
(
2
0
)
<<
"RunAsyncLoop"
;
VLOG
(
2
)
<<
"RunAsyncLoop"
;
auto
grad_to_block_id_str
=
Attr
<
std
::
vector
<
std
::
string
>>
(
"grad_to_block_id"
);
DoubleFindMap
<
std
::
string
,
int32_t
>
grad_to_block_id
;
...
...
@@ -225,7 +225,7 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
const
std
::
string
&
grad_and_id
)
{
std
::
vector
<
std
::
string
>
pieces
;
split
(
grad_and_id
,
':'
,
&
pieces
);
VLOG
(
3
0
)
<<
"after split, key = "
<<
pieces
[
0
]
<<
", id="
<<
pieces
[
1
];
VLOG
(
3
)
<<
"after split, key = "
<<
pieces
[
0
]
<<
", id="
<<
pieces
[
1
];
PADDLE_ENFORCE_EQ
(
pieces
.
size
(),
2
);
PADDLE_ENFORCE_EQ
(
out_map
->
count
(
pieces
[
0
]),
0
);
...
...
@@ -270,7 +270,7 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
while
(
true
)
{
if
(
rpc_service_
->
IsExit
())
{
VLOG
(
4
0
)
<<
"get exit!rpc_processor break!"
;
VLOG
(
4
)
<<
"get exit!rpc_processor break!"
;
break
;
}
...
...
@@ -332,9 +332,9 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
std
::
string
endpoint
=
Attr
<
std
::
string
>
(
"endpoint"
);
int
checkpoint_block_id
=
Attr
<
int
>
(
kCheckpointBlockId
);
VLOG
(
4
0
)
<<
"sync_mode:"
<<
sync_mode
<<
", fan_in:"
<<
fan_in
<<
", end_point:"
<<
endpoint
<<
", checkpoint_block_id: "
<<
checkpoint_block_id
;
VLOG
(
4
)
<<
"sync_mode:"
<<
sync_mode
<<
", fan_in:"
<<
fan_in
<<
", end_point:"
<<
endpoint
<<
", checkpoint_block_id: "
<<
checkpoint_block_id
;
rpc_service_
.
reset
(
new
RPCSERVER_T
(
endpoint
,
fan_in
));
...
...
@@ -383,8 +383,8 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
prefetch_var_name_to_block_id_str
)
{
std
::
vector
<
std
::
string
>
pieces
;
split
(
prefetch_var_name_and_id
,
':'
,
&
pieces
);
VLOG
(
3
0
)
<<
"after split, prefetch_var = "
<<
pieces
[
0
]
<<
", id="
<<
pieces
[
1
];
VLOG
(
3
)
<<
"after split, prefetch_var = "
<<
pieces
[
0
]
<<
", id="
<<
pieces
[
1
];
PADDLE_ENFORCE_EQ
(
pieces
.
size
(),
2
);
int
block_id
=
std
::
stoi
(
pieces
[
1
]);
...
...
@@ -415,7 +415,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
// start the server listening after all member initialized.
server_thread_
.
reset
(
new
std
::
thread
(
RunServer
,
rpc_service_
));
VLOG
(
3
0
)
<<
"wait server thread to become ready..."
;
VLOG
(
3
)
<<
"wait server thread to become ready..."
;
rpc_service_
->
WaitServerReady
();
// register SIGINT(from ctrl+C) and SIGTERM(from kill) signal handlers
...
...
paddle/fluid/operators/distributed_ops/prefetch_op.cc
浏览文件 @
d3e140a5
...
...
@@ -48,12 +48,12 @@ class PrefetchOp : public framework::OperatorBase {
std
::
vector
<
distributed
::
VarHandlePtr
>
rets
;
for
(
size_t
i
=
0
;
i
<
ins
.
size
();
i
++
)
{
if
(
NeedSend
(
scope
,
ins
[
i
]))
{
VLOG
(
3
0
)
<<
"sending "
<<
ins
[
i
]
<<
" to "
<<
epmap
[
i
]
<<
" to get "
<<
outs
[
i
]
<<
" back"
;
VLOG
(
3
)
<<
"sending "
<<
ins
[
i
]
<<
" to "
<<
epmap
[
i
]
<<
" to get "
<<
outs
[
i
]
<<
" back"
;
rets
.
push_back
(
rpc_client
->
AsyncPrefetchVar
(
epmap
[
i
],
ctx
,
scope
,
ins
[
i
],
outs
[
i
]));
}
else
{
VLOG
(
3
0
)
<<
"don't send no-initialied variable: "
<<
ins
[
i
];
VLOG
(
3
)
<<
"don't send no-initialied variable: "
<<
ins
[
i
];
}
}
for
(
size_t
i
=
0
;
i
<
rets
.
size
();
i
++
)
{
...
...
paddle/fluid/operators/distributed_ops/recv_op.cc
浏览文件 @
d3e140a5
...
...
@@ -47,7 +47,7 @@ class RecvOp : public framework::OperatorBase {
std
::
vector
<
distributed
::
VarHandlePtr
>
rets
;
for
(
size_t
i
=
0
;
i
<
outs
.
size
();
i
++
)
{
VLOG
(
3
0
)
<<
"getting "
<<
outs
[
i
]
<<
" from "
<<
epmap
[
i
];
VLOG
(
3
)
<<
"getting "
<<
outs
[
i
]
<<
" from "
<<
epmap
[
i
];
rets
.
push_back
(
rpc_client
->
AsyncGetVar
(
epmap
[
i
],
ctx
,
scope
,
outs
[
i
]));
}
if
(
sync_mode
)
{
...
...
paddle/fluid/operators/distributed_ops/send_barrier_op.cc
浏览文件 @
d3e140a5
...
...
@@ -42,12 +42,12 @@ class SendBarrierOp : public framework::OperatorBase {
distributed
::
RPCClient
::
GetInstance
<
RPCCLIENT_T
>
(
Attr
<
int
>
(
"trainer_id"
));
VLOG
(
3
0
)
<<
"SendBarrierOp sync"
;
VLOG
(
3
)
<<
"SendBarrierOp sync"
;
// need to wait before sending send_barrier message
PADDLE_ENFORCE
(
rpc_client
->
Wait
(),
"internal error in RPCClient"
);
for
(
auto
&
ep
:
eps
)
{
VLOG
(
3
0
)
<<
"send barrier, ep: "
<<
ep
;
VLOG
(
3
)
<<
"send barrier, ep: "
<<
ep
;
rpc_client
->
AsyncSendBatchBarrier
(
ep
);
}
PADDLE_ENFORCE
(
rpc_client
->
Wait
(),
"internal error in RPCClient"
);
...
...
paddle/fluid/operators/distributed_ops/send_op.cc
浏览文件 @
d3e140a5
...
...
@@ -50,10 +50,10 @@ class SendOp : public framework::OperatorBase {
std
::
vector
<
distributed
::
VarHandlePtr
>
rets
;
for
(
size_t
i
=
0
;
i
<
ins
.
size
();
i
++
)
{
if
(
NeedSend
(
scope
,
ins
[
i
]))
{
VLOG
(
3
0
)
<<
"sending "
<<
ins
[
i
]
<<
" to "
<<
epmap
[
i
];
VLOG
(
3
)
<<
"sending "
<<
ins
[
i
]
<<
" to "
<<
epmap
[
i
];
rets
.
push_back
(
rpc_client
->
AsyncSendVar
(
epmap
[
i
],
ctx
,
scope
,
ins
[
i
]));
}
else
{
VLOG
(
3
0
)
<<
"don't send no-initialied variable: "
<<
ins
[
i
];
VLOG
(
3
)
<<
"don't send no-initialied variable: "
<<
ins
[
i
];
}
}
if
(
sync_send
)
{
...
...
paddle/fluid/operators/distributed_ops/send_recv_op_test.cc
浏览文件 @
d3e140a5
...
...
@@ -120,7 +120,7 @@ void AddOp(const std::string &type, const f::VariableNameMap &inputs,
void
StartServerNet
(
bool
is_sparse
,
std
::
atomic
<
bool
>
*
initialized
)
{
f
::
Scope
scope
;
p
::
CPUPlace
place
;
VLOG
(
4
0
)
<<
"before init tensor"
;
VLOG
(
4
)
<<
"before init tensor"
;
if
(
is_sparse
)
{
InitSelectedRowsInScope
(
place
,
&
scope
);
}
else
{
...
...
@@ -146,7 +146,7 @@ void StartServerNet(bool is_sparse, std::atomic<bool> *initialized) {
attrs
.
insert
({
"PrefetchBlock"
,
prefetch_block
});
attrs
.
insert
({
"grad_to_block_id"
,
std
::
vector
<
std
::
string
>
({
""
})});
attrs
.
insert
({
"sync_mode"
,
true
});
VLOG
(
4
0
)
<<
"before init op"
;
VLOG
(
4
)
<<
"before init op"
;
listen_and_serv_op
=
f
::
OpRegistry
::
CreateOp
(
"listen_and_serv"
,
{{
"X"
,
{
"x1"
}}},
{},
attrs
);
*
initialized
=
true
;
...
...
paddle/fluid/operators/distributed_ops/split_byref_op.h
浏览文件 @
d3e140a5
...
...
@@ -32,7 +32,7 @@ class SplitByrefOpKernel : public framework::OpKernel<T> {
for
(
size_t
i
=
0
;
i
<
outs
.
size
();
++
i
)
{
// NOTE: no need to call mutable_data here to allocate memory.
auto
*
out
=
outs
[
i
];
VLOG
(
3
0
)
<<
"spliting by ref: "
<<
row_offset
<<
" "
<<
out
->
dims
()[
0
];
VLOG
(
3
)
<<
"spliting by ref: "
<<
row_offset
<<
" "
<<
out
->
dims
()[
0
];
*
out
=
in
->
Slice
(
row_offset
,
row_offset
+
out
->
dims
()[
0
]);
row_offset
+=
out
->
dims
()[
0
];
}
...
...
paddle/fluid/operators/distributed_ops/split_ids_op.h
浏览文件 @
d3e140a5
...
...
@@ -44,7 +44,7 @@ class SplitIdsOpKernel : public framework::OpKernel<T> {
for
(
size_t
i
=
0
;
i
<
ids_tensors
.
size
();
++
i
)
{
batch_size
+=
ids_tensors
[
i
]
->
dims
()[
0
];
}
VLOG
(
4
0
)
<<
"Get Total BatchSize is: "
<<
batch_size
;
VLOG
(
4
)
<<
"Get Total BatchSize is: "
<<
batch_size
;
std
::
vector
<
T
>
all_ids
(
batch_size
);
int
offset
=
0
;
...
...
paddle/fluid/operators/lod_rank_table_op.cc
浏览文件 @
d3e140a5
...
...
@@ -30,9 +30,9 @@ class LoDRankTableOp : public framework::OperatorBase {
auto
x
=
scope
.
FindVar
(
Input
(
"X"
))
->
Get
<
framework
::
LoDTensor
>
();
auto
*
out
=
scope
.
FindVar
(
Output
(
"Out"
))
->
GetMutable
<
framework
::
LoDRankTable
>
();
VLOG
(
10
0
)
<<
"Level = "
<<
static_cast
<
size_t
>
(
Attr
<
int
>
(
"level"
));
VLOG
(
10
)
<<
"Level = "
<<
static_cast
<
size_t
>
(
Attr
<
int
>
(
"level"
));
out
->
Reset
(
x
.
lod
(),
static_cast
<
size_t
>
(
Attr
<
int
>
(
"level"
)));
VLOG
(
10
0
)
<<
Input
(
"X"
)
<<
"'s lod information is "
<<
*
out
;
VLOG
(
10
)
<<
Input
(
"X"
)
<<
"'s lod information is "
<<
*
out
;
}
};
...
...
paddle/fluid/operators/lookup_table_op.cc
浏览文件 @
d3e140a5
...
...
@@ -134,13 +134,13 @@ class LookupTableOpGradVarTypeInference : public framework::VarTypeInference {
auto
attr
=
op_desc
.
GetAttr
(
"is_sparse"
);
bool
is_sparse
=
boost
::
get
<
bool
>
(
attr
);
if
(
is_sparse
)
{
VLOG
(
3
0
)
<<
"lookup_table_grad op "
<<
framework
::
GradVarName
(
"W"
)
<<
" is set to SelectedRows"
;
VLOG
(
3
)
<<
"lookup_table_grad op "
<<
framework
::
GradVarName
(
"W"
)
<<
" is set to SelectedRows"
;
block
->
Var
(
out_var_name
)
->
SetType
(
framework
::
proto
::
VarType
::
SELECTED_ROWS
);
}
else
{
VLOG
(
3
0
)
<<
"lookup_table_grad op "
<<
framework
::
GradVarName
(
"W"
)
<<
" is set to LoDTensor"
;
VLOG
(
3
)
<<
"lookup_table_grad op "
<<
framework
::
GradVarName
(
"W"
)
<<
" is set to LoDTensor"
;
block
->
Var
(
out_var_name
)
->
SetType
(
framework
::
proto
::
VarType
::
LOD_TENSOR
);
}
block
->
Var
(
out_var_name
)
->
SetDataType
(
block
->
Var
(
"W"
)
->
GetDataType
());
...
...
paddle/fluid/operators/math/cpu_vec_test.cc
浏览文件 @
d3e140a5
...
...
@@ -96,8 +96,8 @@ void TestAndBench(const int n, std::function<void(const int, const T*, T*)> tgt,
}
auto
et
=
GetCurrentUS
();
VLOG
(
3
0
)
<<
"Vec size "
<<
n
<<
": refer takes: "
<<
(
et
-
mt
)
/
repeat
<<
" us, tgt takes: "
<<
(
mt
-
st
)
/
repeat
;
VLOG
(
3
)
<<
"Vec size "
<<
n
<<
": refer takes: "
<<
(
et
-
mt
)
/
repeat
<<
" us, tgt takes: "
<<
(
mt
-
st
)
/
repeat
;
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
EXPECT_NEAR
(
ytgt_data
[
i
],
yref_data
[
i
],
1e-3
);
}
...
...
paddle/fluid/operators/math/jit_kernel_test.cc
浏览文件 @
d3e140a5
...
...
@@ -86,7 +86,7 @@ TEST(JitKernel, vrelu) {
vrelu_intri8
(
d
,
x_data
,
zref_data
);
}
auto
si1
=
GetCurrentUS
();
VLOG
(
3
0
)
<<
"Vec size 8 intr takes: "
<<
(
si1
-
si0
)
/
repeat
<<
" us"
;
VLOG
(
3
)
<<
"Vec size 8 intr takes: "
<<
(
si1
-
si0
)
/
repeat
<<
" us"
;
}
#endif
auto
ttgts
=
GetCurrentUS
();
...
...
@@ -94,9 +94,8 @@ TEST(JitKernel, vrelu) {
ker
->
Compute
(
x_data
,
ztgt_data
,
d
);
}
auto
ttgte
=
GetCurrentUS
();
VLOG
(
30
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
<<
" us, tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
<<
" us"
;
VLOG
(
3
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
<<
" us, tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
<<
" us"
;
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
EXPECT_NEAR
(
ztgt_data
[
i
],
zref_data
[
i
],
1e-3
);
}
...
...
@@ -127,9 +126,8 @@ TEST(JitKernel, vaddbias) {
}
auto
ttgte
=
GetCurrentUS
();
VLOG
(
30
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
<<
" us, tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
<<
" us"
;
VLOG
(
3
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
<<
" us, tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
<<
" us"
;
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
EXPECT_NEAR
(
ztgt_data
[
i
],
zref_data
[
i
],
1e-3
);
}
...
...
@@ -175,14 +173,14 @@ TEST(JitKernel, vexp) {
}
auto
ttgte
=
GetCurrentUS
();
VLOG
(
30
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
VLOG
(
3
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
#ifdef PADDLE_WITH_MKLML
<<
" us, mkl takes: "
<<
(
tmkle
-
tmkls
)
/
repeat
<<
" us, "
<<
" us, mkl takes: "
<<
(
tmkle
-
tmkls
)
/
repeat
<<
" us, "
#else
<<
" us, "
<<
" us, "
#endif
<<
"tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
<<
" us"
;
<<
"tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
<<
" us"
;
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
EXPECT_NEAR
(
ztgt_data
[
i
],
zref_data
[
i
],
1e-3
);
}
...
...
@@ -235,10 +233,9 @@ TEST(JitKernel, vsigmoid) {
}
auto
ttgte
=
GetCurrentUS
();
VLOG
(
30
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
<<
" us, better(jit exp) takes: "
<<
(
tmkle
-
tmkls
)
/
repeat
<<
" us, tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
<<
" us"
;
VLOG
(
3
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
<<
" us, better(jit exp) takes: "
<<
(
tmkle
-
tmkls
)
/
repeat
<<
" us, tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
<<
" us"
;
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
EXPECT_NEAR
(
ztgt_data
[
i
],
zref_data
[
i
],
1e-3
);
}
...
...
@@ -296,10 +293,9 @@ TEST(JitKernel, vtanh) {
}
auto
ttgte
=
GetCurrentUS
();
VLOG
(
30
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
<<
" us, better(jit exp) takes: "
<<
(
tmkle
-
tmkls
)
/
repeat
<<
" us, tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
<<
" us"
;
VLOG
(
3
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
<<
" us, better(jit exp) takes: "
<<
(
tmkle
-
tmkls
)
/
repeat
<<
" us, tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
<<
" us"
;
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
EXPECT_NEAR
(
ztgt_data
[
i
],
zref_data
[
i
],
1e-3
);
}
...
...
@@ -397,10 +393,9 @@ TEST(JitKernel, lstm) {
ker
->
ComputeCtHt
(
&
step
,
&
attr
);
}
auto
ttgte
=
GetCurrentUS
();
VLOG
(
30
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
<<
" us, better(jit) takes: "
<<
(
tmkle
-
tmkls
)
/
repeat
<<
" us, tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
<<
" us"
;
VLOG
(
3
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
<<
" us, better(jit) takes: "
<<
(
tmkle
-
tmkls
)
/
repeat
<<
" us, tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
<<
" us"
;
}
}
...
...
@@ -473,8 +468,8 @@ TEST(JitKernel, vscal) {
vscal_inp_intri8
(
d
,
a
,
y_data
);
}
auto
si3
=
GetCurrentUS
();
VLOG
(
3
0
)
<<
"Vec size 8 intr takes: "
<<
(
si1
-
si0
)
/
repeat
<<
" us, inplace: "
<<
(
si3
-
si2
)
/
repeat
<<
" us"
;
VLOG
(
3
)
<<
"Vec size 8 intr takes: "
<<
(
si1
-
si0
)
/
repeat
<<
" us, inplace: "
<<
(
si3
-
si2
)
/
repeat
<<
" us"
;
}
#endif
...
...
@@ -488,18 +483,15 @@ TEST(JitKernel, vscal) {
ker
->
Compute
(
&
a
,
y_data
,
y_data
,
d
);
}
auto
ttgte1
=
GetCurrentUS
();
VLOG
(
30
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
<<
" us, inplace takes: "
<<
(
trefe1
-
trefs1
)
/
repeat
VLOG
(
3
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
<<
" us, inplace takes: "
<<
(
trefe1
-
trefs1
)
/
repeat
#ifdef PADDLE_WITH_MKLML
<<
" us, mkl inplace takes: "
<<
(
tmkle
-
tmkls
)
/
repeat
<<
" us, "
<<
" us, mkl inplace takes: "
<<
(
tmkle
-
tmkls
)
/
repeat
<<
" us, "
#else
<<
" us, "
<<
" us, "
#endif
<<
"tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
<<
"us, tgt inplace takes: "
<<
(
ttgte1
-
ttgts1
)
/
repeat
<<
" us"
;
<<
"tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
<<
"us, tgt inplace takes: "
<<
(
ttgte1
-
ttgts1
)
/
repeat
<<
" us"
;
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
EXPECT_NEAR
(
ztgt_data
[
i
],
zref_data
[
i
],
1e-3
);
}
...
...
@@ -557,7 +549,7 @@ TEST(JitKernel, vmul) {
vmul_intri8
(
d
,
x_data
,
y_data
,
zref_data
);
}
auto
si1
=
GetCurrentUS
();
VLOG
(
3
0
)
<<
"Vec size 8 intr takes: "
<<
(
si1
-
si0
)
/
repeat
;
VLOG
(
3
)
<<
"Vec size 8 intr takes: "
<<
(
si1
-
si0
)
/
repeat
;
}
#endif
...
...
@@ -567,14 +559,13 @@ TEST(JitKernel, vmul) {
}
auto
ttgte
=
GetCurrentUS
();
VLOG
(
30
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
VLOG
(
3
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
#ifdef PADDLE_WITH_MKLML
<<
" us, mkl takes: "
<<
(
tmkle
-
tmkls
)
/
repeat
<<
" us, "
<<
" us, mkl takes: "
<<
(
tmkle
-
tmkls
)
/
repeat
<<
" us, "
#else
<<
" us, "
<<
" us, "
#endif
<<
"tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
<<
" us"
;
<<
"tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
<<
" us"
;
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
EXPECT_NEAR
(
ztgt_data
[
i
],
zref_data
[
i
],
1e-3
);
}
...
...
@@ -632,7 +623,7 @@ TEST(JitKernel, vadd) {
vadd_intri8
(
d
,
x_data
,
y_data
,
zref_data
);
}
auto
si1
=
GetCurrentUS
();
VLOG
(
3
0
)
<<
"Vec size 8 intr takes: "
<<
(
si1
-
si0
)
/
repeat
;
VLOG
(
3
)
<<
"Vec size 8 intr takes: "
<<
(
si1
-
si0
)
/
repeat
;
}
#endif
...
...
@@ -642,14 +633,13 @@ TEST(JitKernel, vadd) {
}
auto
ttgte
=
GetCurrentUS
();
VLOG
(
30
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
VLOG
(
3
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
#ifdef PADDLE_WITH_MKLML
<<
" us, mkl takes: "
<<
(
tmkle
-
tmkls
)
/
repeat
<<
" us, "
<<
" us, mkl takes: "
<<
(
tmkle
-
tmkls
)
/
repeat
<<
" us, "
#else
<<
" us, "
<<
" us, "
#endif
<<
"tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
<<
" us"
;
<<
"tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
<<
" us"
;
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
EXPECT_NEAR
(
ztgt_data
[
i
],
zref_data
[
i
],
1e-3
);
}
...
...
@@ -699,10 +689,9 @@ TEST(JitKernel, vaddrelu) {
ker
->
Compute
(
x_data
,
y_data
,
ztgt_data
,
d
);
}
auto
ttgte
=
GetCurrentUS
();
VLOG
(
30
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
<<
" us, better takes: "
<<
(
tmkle
-
tmkls
)
/
repeat
<<
" us, "
<<
"tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
<<
" us"
;
VLOG
(
3
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
<<
" us, better takes: "
<<
(
tmkle
-
tmkls
)
/
repeat
<<
" us, "
<<
"tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
<<
" us"
;
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
EXPECT_NEAR
(
ztgt_data
[
i
],
zref_data
[
i
],
1e-3
);
}
...
...
paddle/fluid/operators/math/selected_rows_functor.cc
浏览文件 @
d3e140a5
...
...
@@ -270,7 +270,7 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
const
std
::
vector
<
const
framework
::
SelectedRows
*>&
inputs
,
framework
::
SelectedRows
*
output
)
{
if
(
inputs
.
size
()
==
0
)
{
VLOG
(
3
0
)
<<
"no input! return"
;
VLOG
(
3
)
<<
"no input! return"
;
return
;
}
const
framework
::
SelectedRows
*
has_value_input
=
nullptr
;
...
...
@@ -281,7 +281,7 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
}
}
if
(
has_value_input
==
nullptr
)
{
VLOG
(
3
0
)
<<
"no input has value! just return"
<<
std
::
endl
;
VLOG
(
3
)
<<
"no input has value! just return"
<<
std
::
endl
;
return
;
}
auto
input_width
=
has_value_input
->
value
().
dims
()[
1
];
...
...
paddle/fluid/operators/math/selected_rows_functor.cu
浏览文件 @
d3e140a5
...
...
@@ -314,7 +314,7 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
const
std
::
vector
<
const
framework
::
SelectedRows
*>&
inputs
,
framework
::
SelectedRows
*
output
)
{
if
(
inputs
.
size
()
==
0
)
{
VLOG
(
3
0
)
<<
"no input! return"
;
VLOG
(
3
)
<<
"no input! return"
;
return
;
}
const
framework
::
SelectedRows
*
has_value_input
=
nullptr
;
...
...
@@ -325,7 +325,7 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
}
}
if
(
has_value_input
==
nullptr
)
{
VLOG
(
3
0
)
<<
"no input has value! just return"
<<
std
::
endl
;
VLOG
(
3
)
<<
"no input has value! just return"
<<
std
::
endl
;
return
;
}
auto
input_width
=
has_value_input
->
value
().
dims
()[
1
];
...
...
paddle/fluid/operators/mul_op.cc
浏览文件 @
d3e140a5
...
...
@@ -38,9 +38,9 @@ class MulOp : public framework::OperatorWithKernel {
int
x_num_col_dims
=
ctx
->
Attrs
().
Get
<
int
>
(
"x_num_col_dims"
);
int
y_num_col_dims
=
ctx
->
Attrs
().
Get
<
int
>
(
"y_num_col_dims"
);
VLOG
(
3
0
)
<<
"mul operator x.shape="
<<
x_dims
<<
" y.shape="
<<
y_dims
<<
" x_num_col_dims="
<<
x_num_col_dims
<<
" y_num_col_dims="
<<
y_num_col_dims
;
VLOG
(
3
)
<<
"mul operator x.shape="
<<
x_dims
<<
" y.shape="
<<
y_dims
<<
" x_num_col_dims="
<<
x_num_col_dims
<<
" y_num_col_dims="
<<
y_num_col_dims
;
PADDLE_ENFORCE_GT
(
x_dims
.
size
(),
x_num_col_dims
,
...
...
paddle/fluid/operators/nccl/nccl_op.cu.cc
浏览文件 @
d3e140a5
...
...
@@ -63,16 +63,16 @@ class NCCLAllReduceKernel : public framework::OpKernel<T> {
// device id
int
gpu_id
=
boost
::
get
<
platform
::
CUDAPlace
>
(
ctx
.
GetPlace
()).
GetDeviceId
();
int
idx
=
comm
->
GetCommId
(
gpu_id
);
VLOG
(
3
0
)
<<
"gpu : "
<<
" invoke allreduce. send "
<<
x
->
numel
()
<<
" recv "
<<
out
->
numel
();
VLOG
(
3
)
<<
"gpu : "
<<
" invoke allreduce. send "
<<
x
->
numel
()
<<
" recv "
<<
out
->
numel
();
PADDLE_ENFORCE
(
platform
::
dynload
::
ncclAllReduce
(
x
->
data
<
T
>
(),
out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
out
->
numel
(),
NCCLTypeWrapper
<
T
>::
type
,
reduction_op_
,
comm
->
comms
().
at
(
idx
),
ctx
.
cuda_device_context
().
stream
()));
VLOG
(
3
0
)
<<
"gpu : "
<<
" finished allreduce. send "
<<
x
->
numel
()
<<
" recv "
<<
out
->
numel
();
VLOG
(
3
)
<<
"gpu : "
<<
" finished allreduce. send "
<<
x
->
numel
()
<<
" recv "
<<
out
->
numel
();
}
};
...
...
@@ -109,14 +109,14 @@ class NCCLReduceKernel : public framework::OpKernel<T> {
}
else
{
out
->
Resize
(
framework
::
make_ddim
({
0
}));
}
VLOG
(
3
0
)
<<
"gpu : "
<<
gpu_id
<<
" invoke reduce. send "
<<
x
->
numel
()
<<
" recv "
<<
out
->
numel
();
VLOG
(
3
)
<<
"gpu : "
<<
gpu_id
<<
" invoke reduce. send "
<<
x
->
numel
()
<<
" recv "
<<
out
->
numel
();
PADDLE_ENFORCE
(
platform
::
dynload
::
ncclReduce
(
x
->
data
<
T
>
(),
recvbuffer
,
x
->
numel
(),
NCCLTypeWrapper
<
T
>::
type
,
reduction_op_
,
root
,
comm
->
comms
().
at
(
idx
),
ctx
.
cuda_device_context
().
stream
()));
VLOG
(
3
0
)
<<
"gpu : "
<<
gpu_id
<<
" finished reduce. send "
<<
x
->
numel
()
<<
" recv "
<<
out
->
numel
();
VLOG
(
3
)
<<
"gpu : "
<<
gpu_id
<<
" finished reduce. send "
<<
x
->
numel
()
<<
" recv "
<<
out
->
numel
();
}
};
...
...
@@ -133,22 +133,21 @@ class NCCLBcastKernel : public framework::OpKernel<T> {
int
idx
=
comm
->
GetCommId
(
gpu_id
);
if
(
idx
==
root
)
{
auto
*
x
=
ctx
.
Input
<
LoDTensor
>
(
"X"
);
VLOG
(
3
0
)
<<
"gpu : "
<<
gpu_id
<<
" invoke Bcast. send "
<<
x
->
numel
();
VLOG
(
3
)
<<
"gpu : "
<<
gpu_id
<<
" invoke Bcast. send "
<<
x
->
numel
();
PADDLE_ENFORCE
(
platform
::
dynload
::
ncclBcast
(
reinterpret_cast
<
void
*>
(
const_cast
<
T
*>
(
x
->
data
<
T
>
())),
x
->
numel
(),
NCCLTypeWrapper
<
T
>::
type
,
root
,
comm
->
comms
().
at
(
idx
),
ctx
.
cuda_device_context
().
stream
()));
VLOG
(
3
0
)
<<
"gpu : "
<<
gpu_id
<<
" finished Bcast."
;
VLOG
(
3
)
<<
"gpu : "
<<
gpu_id
<<
" finished Bcast."
;
}
else
{
auto
*
out
=
ctx
.
Output
<
LoDTensor
>
(
"Out"
);
VLOG
(
3
0
)
<<
"gpu : "
<<
gpu_id
<<
" invoke Bcast. recv buffer "
<<
framework
::
product
(
out
->
dims
());
VLOG
(
3
)
<<
"gpu : "
<<
gpu_id
<<
" invoke Bcast. recv buffer "
<<
framework
::
product
(
out
->
dims
());
PADDLE_ENFORCE
(
platform
::
dynload
::
ncclBcast
(
out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
out
->
numel
(),
NCCLTypeWrapper
<
T
>::
type
,
root
,
comm
->
comms
().
at
(
idx
),
ctx
.
cuda_device_context
().
stream
()));
VLOG
(
30
)
<<
"gpu : "
<<
gpu_id
<<
" finished Bcast. recv "
<<
out
->
numel
();
VLOG
(
3
)
<<
"gpu : "
<<
gpu_id
<<
" finished Bcast. recv "
<<
out
->
numel
();
}
}
};
...
...
paddle/fluid/operators/nccl/nccl_op_test.cu.cc
浏览文件 @
d3e140a5
...
...
@@ -86,9 +86,9 @@ class NCCLTester : public ::testing::Test {
(
*
p_scopes
).
resize
(
gpu_list_
.
size
());
auto
op
=
f
::
OpRegistry
::
CreateOp
(
*
op1
);
VLOG
(
1
0
)
<<
"invoke NCCLInitOp."
;
VLOG
(
1
)
<<
"invoke NCCLInitOp."
;
op
->
Run
(
g_scope_
,
cpu_place
);
VLOG
(
1
0
)
<<
"NCCLInitOp finished."
;
VLOG
(
1
)
<<
"NCCLInitOp finished."
;
}
int
GetGPUData
(
int
gpu_id
)
{
return
gpu_id
+
42
;
}
...
...
@@ -109,7 +109,7 @@ class NCCLTester : public ::testing::Test {
std
::
vector
<
T
>
send_vector
(
f
::
product
(
kDims
),
GetGPUData
(
gpu_id
));
paddle
::
framework
::
TensorFromVector
<
T
>
(
send_vector
,
*
ctx
,
send_tensor
);
VLOG
(
1
0
)
<<
"Send Tensor filled with elements "
<<
send_tensor
->
numel
();
VLOG
(
1
)
<<
"Send Tensor filled with elements "
<<
send_tensor
->
numel
();
}
lk
.
unlock
();
...
...
@@ -119,11 +119,11 @@ class NCCLTester : public ::testing::Test {
auto
op
=
f
::
OpRegistry
::
CreateOp
(
*
op1
);
VLOG
(
1
0
)
<<
"Device : "
<<
gpu_id
<<
" invoke "
<<
op_desc
.
Type
();
VLOG
(
1
0
)
<<
" send_tensor : "
<<
send_tensor
->
numel
()
<<
" recv_tensor : "
<<
recv_tensor
->
numel
();
VLOG
(
1
)
<<
"Device : "
<<
gpu_id
<<
" invoke "
<<
op_desc
.
Type
();
VLOG
(
1
)
<<
" send_tensor : "
<<
send_tensor
->
numel
()
<<
" recv_tensor : "
<<
recv_tensor
->
numel
();
op
->
Run
(
*
scope
,
place
);
VLOG
(
1
0
)
<<
"Device : "
<<
gpu_id
<<
" finished "
<<
op_desc
.
Type
();
VLOG
(
1
)
<<
"Device : "
<<
gpu_id
<<
" finished "
<<
op_desc
.
Type
();
}
public:
...
...
paddle/fluid/operators/nce_op.cc
浏览文件 @
d3e140a5
...
...
@@ -162,9 +162,9 @@ class NCEOpMaker : public framework::OpProtoAndCheckerMaker {
"user should avoid setting this attribute."
)
.
SetDefault
({});
AddComment
(
R"DOC(
Compute and return the noise-contrastive estimation training loss. See
`Noise-contrastive estimation: A new estimation principle for unnormalized
statistical models
Compute and return the noise-contrastive estimation training loss. See
`Noise-contrastive estimation: A new estimation principle for unnormalized
statistical models
<http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf>`_.
By default this operator uses a uniform distribution for sampling.
)DOC"
);
...
...
@@ -230,14 +230,14 @@ class NCEOpGradVarTypeInference : public framework::VarTypeInference {
auto
attr
=
op_desc
.
GetAttr
(
"is_sparse"
);
bool
is_sparse
=
boost
::
get
<
bool
>
(
attr
);
if
(
is_sparse
)
{
VLOG
(
3
0
)
<<
"nce_op_grad op "
<<
weight_grad
<<
" and "
<<
bias_grad
<<
" is set to SelectedRows"
;
VLOG
(
3
)
<<
"nce_op_grad op "
<<
weight_grad
<<
" and "
<<
bias_grad
<<
" is set to SelectedRows"
;
block
->
Var
(
weight_grad
)
->
SetType
(
framework
::
proto
::
VarType
::
SELECTED_ROWS
);
block
->
Var
(
bias_grad
)
->
SetType
(
framework
::
proto
::
VarType
::
SELECTED_ROWS
);
}
else
{
VLOG
(
3
0
)
<<
"nce_op_grad op "
<<
weight_grad
<<
" and "
<<
bias_grad
<<
" is set to LoDTensor"
;
VLOG
(
3
)
<<
"nce_op_grad op "
<<
weight_grad
<<
" and "
<<
bias_grad
<<
" is set to LoDTensor"
;
block
->
Var
(
weight_grad
)
->
SetType
(
framework
::
proto
::
VarType
::
LOD_TENSOR
);
block
->
Var
(
bias_grad
)
->
SetType
(
framework
::
proto
::
VarType
::
LOD_TENSOR
);
}
...
...
paddle/fluid/operators/optimizers/adam_op.h
浏览文件 @
d3e140a5
...
...
@@ -297,7 +297,7 @@ class AdamOpKernel : public framework::OpKernel<T> {
auto
&
grad
=
Ref
(
ctx
.
Input
<
framework
::
SelectedRows
>
(
"Grad"
),
"Must set Grad"
);
if
(
grad
.
rows
().
size
()
==
0
)
{
VLOG
(
3
0
)
<<
"grad row size is 0!!"
;
VLOG
(
3
)
<<
"grad row size is 0!!"
;
return
;
}
...
...
paddle/fluid/operators/optimizers/momentum_op.h
浏览文件 @
d3e140a5
...
...
@@ -346,7 +346,7 @@ class MomentumOpKernel : public framework::OpKernel<T> {
// sparse update maybe empty.
if
(
grad
->
rows
().
size
()
==
0
)
{
VLOG
(
3
0
)
<<
"Grad SelectedRows contains no data!"
;
VLOG
(
3
)
<<
"Grad SelectedRows contains no data!"
;
return
;
}
auto
*
merged_grad
=
const_cast
<
framework
::
Scope
&>
(
ctx
.
scope
())
...
...
paddle/fluid/operators/optimizers/sgd_op.h
浏览文件 @
d3e140a5
...
...
@@ -98,10 +98,10 @@ class SGDOpKernel : public framework::OpKernel<T> {
auto
param_row_width
=
param
.
value
().
dims
()[
1
];
auto
grad_row_width
=
grad
.
value
().
dims
()[
1
];
VLOG
(
4
0
)
<<
" param rows: "
<<
param
.
rows
().
size
()
<<
" param memory rows: "
<<
param
.
value
().
dims
()[
0
]
<<
" grad rows: "
<<
grad
.
rows
().
size
()
<<
" grad memory rows: "
<<
grad
.
value
().
dims
()[
0
];
VLOG
(
4
)
<<
" param rows: "
<<
param
.
rows
().
size
()
<<
" param memory rows: "
<<
param
.
value
().
dims
()[
0
]
<<
" grad rows: "
<<
grad
.
rows
().
size
()
<<
" grad memory rows: "
<<
grad
.
value
().
dims
()[
0
];
PADDLE_ENFORCE_EQ
(
param_row_width
,
grad_row_width
,
"param_row should have the same size with grad_row"
);
...
...
paddle/fluid/operators/random_crop_op.h
浏览文件 @
d3e140a5
...
...
@@ -155,8 +155,8 @@ class RandomCropKernel : public framework::OpKernel<T> {
seed
=
*
cpu_seed
.
data
<
int64_t
>
();
}
}
else
{
VLOG
(
5
0
)
<<
"WARNING: The input 'Seed' is not initialized, use attribute "
"'startup_seed' instead."
;
VLOG
(
5
)
<<
"WARNING: The input 'Seed' is not initialized, use attribute "
"'startup_seed' instead."
;
seed
=
ctx
.
Attr
<
int
>
(
"startup_seed"
);
}
auto
shape
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"shape"
);
...
...
paddle/fluid/operators/reader/blocking_queue.h
浏览文件 @
d3e140a5
...
...
@@ -42,7 +42,7 @@ class BlockingQueue {
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
send_cv_
.
wait
(
lock
,
[
&
]
{
return
queue_
.
size
()
<
capacity_
||
closed_
;
});
if
(
closed_
)
{
VLOG
(
5
0
)
VLOG
(
5
)
<<
"WARNING: Sending an element to a closed reader::BlokcingQueue."
;
return
false
;
}
...
...
@@ -56,7 +56,7 @@ class BlockingQueue {
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
send_cv_
.
wait
(
lock
,
[
&
]
{
return
queue_
.
size
()
<
capacity_
||
closed_
;
});
if
(
closed_
)
{
VLOG
(
5
0
)
VLOG
(
5
)
<<
"WARNING: Sending an element to a closed reader::BlokcingQueue."
;
return
false
;
}
...
...
paddle/fluid/operators/reader/create_shuffle_reader_op.cc
浏览文件 @
d3e140a5
...
...
@@ -26,7 +26,7 @@ class ShuffleReader : public framework::DecoratedReader {
ShuffleReader
(
const
std
::
shared_ptr
<
ReaderBase
>&
reader
,
size_t
buffer_size
,
size_t
seed
=
0
)
:
DecoratedReader
(
reader
),
buffer_size_
(
buffer_size
),
seed_
(
seed
)
{
VLOG
(
10
0
)
<<
"Create shuffle reader of "
<<
reader_
;
VLOG
(
10
)
<<
"Create shuffle reader of "
<<
reader_
;
if
(
seed_
==
0
)
{
std
::
random_device
device
;
seed_
=
device
();
...
...
@@ -37,7 +37,7 @@ class ShuffleReader : public framework::DecoratedReader {
void
ReadNextImpl
(
std
::
vector
<
framework
::
LoDTensor
>*
out
)
override
{
out
->
clear
();
if
(
iteration_pos_
>=
buffer_
.
size
())
{
VLOG
(
10
0
)
<<
"Resetting shuffle buffer"
;
VLOG
(
10
)
<<
"Resetting shuffle buffer"
;
ReloadBuffer
();
if
(
buffer_
.
empty
())
{
return
;
...
...
@@ -73,7 +73,7 @@ class ShuffleReader : public framework::DecoratedReader {
std
::
mt19937
g
(
seed_
);
std
::
shuffle
(
buffer_
.
begin
(),
buffer_
.
end
(),
g
);
seed_
=
g
();
// update seed_;
VLOG
(
10
0
)
<<
"random buffer size = "
<<
buffer_
.
size
();
VLOG
(
10
)
<<
"random buffer size = "
<<
buffer_
.
size
();
}
size_t
buffer_size_
;
...
...
paddle/fluid/operators/recurrent_op.cc
浏览文件 @
d3e140a5
...
...
@@ -160,7 +160,7 @@ class RecurrentBase : public framework::OperatorBase {
Callback
callback
)
{
PADDLE_ENFORCE_EQ
(
src_vars
.
size
(),
dst_vars
.
size
());
for
(
size_t
i
=
0
;
i
<
dst_vars
.
size
();
++
i
)
{
VLOG
(
10
0
)
<<
"Link "
<<
src_vars
[
i
]
<<
" to "
<<
dst_vars
[
i
];
VLOG
(
10
)
<<
"Link "
<<
src_vars
[
i
]
<<
" to "
<<
dst_vars
[
i
];
AccessTensor
(
src_scope
,
src_vars
[
i
],
dst_scope
,
dst_vars
[
i
],
callback
);
}
}
...
...
@@ -176,7 +176,7 @@ class RecurrentBase : public framework::OperatorBase {
Callback
callback
)
{
PADDLE_ENFORCE_EQ
(
src_vars
.
size
(),
dst_vars
.
size
());
for
(
size_t
i
=
0
;
i
<
dst_vars
.
size
();
++
i
)
{
VLOG
(
10
0
)
<<
"Link "
<<
src_vars
[
i
]
<<
" to "
<<
dst_vars
[
i
];
VLOG
(
10
)
<<
"Link "
<<
src_vars
[
i
]
<<
" to "
<<
dst_vars
[
i
];
AccessTensor
(
src_scope
,
src_vars
[
i
],
dst_scope
,
dst_vars
[
i
],
callback
);
}
}
...
...
@@ -230,7 +230,7 @@ class RecurrentOp : public RecurrentBase {
void
RunImpl
(
const
framework
::
Scope
&
scope
,
const
platform
::
Place
&
place
)
const
override
{
auto
seq_len
=
static_cast
<
size_t
>
(
this
->
GetSequenceLength
(
scope
));
VLOG
(
3
0
)
<<
"Static RNN input sequence length = "
<<
seq_len
;
VLOG
(
3
)
<<
"Static RNN input sequence length = "
<<
seq_len
;
StepScopes
scopes
=
CreateStepScopes
(
scope
,
seq_len
);
auto
reverse
=
Attr
<
bool
>
(
kReverse
);
...
...
@@ -241,7 +241,7 @@ class RecurrentOp : public RecurrentBase {
for
(
size_t
i
=
0
;
i
<
seq_len
;
++
i
)
{
size_t
seq_offset
=
reverse
?
seq_len
-
i
-
1
:
i
;
VLOG
(
3
0
)
<<
"Recurrent operate at the time step "
<<
seq_offset
;
VLOG
(
3
)
<<
"Recurrent operate at the time step "
<<
seq_offset
;
auto
&
cur_scope
=
scopes
.
CurScope
();
...
...
@@ -334,7 +334,7 @@ class RecurrentGradOp : public RecurrentBase {
for
(
size_t
step_id
=
0
;
step_id
<
seq_len
;
++
step_id
)
{
size_t
seq_offset
=
reverse
?
step_id
:
seq_len
-
step_id
-
1
;
VLOG
(
3
0
)
<<
"Recurrent backward operate at the time step "
<<
seq_offset
;
VLOG
(
3
)
<<
"Recurrent backward operate at the time step "
<<
seq_offset
;
auto
&
cur_scope
=
scopes
.
CurScope
();
// Link outside::output_grads --> inside::output_grads
// inside::output_grad = outside::output_grad[seq_offset:seq_offset+1]
...
...
@@ -348,11 +348,11 @@ class RecurrentGradOp : public RecurrentBase {
});
auto
og_set
=
List2Set
(
Inputs
(
kOutputGrads
));
if
(
VLOG_IS_ON
(
10
0
))
{
if
(
VLOG_IS_ON
(
10
))
{
std
::
ostringstream
sout
;
std
::
copy
(
og_set
.
begin
(),
og_set
.
end
(),
std
::
ostream_iterator
<
std
::
string
>
(
sout
,
","
));
VLOG
(
10
0
)
<<
" RNN output gradients = ["
<<
sout
.
str
()
<<
"]"
;
VLOG
(
10
)
<<
" RNN output gradients = ["
<<
sout
.
str
()
<<
"]"
;
}
// Link states
...
...
@@ -374,7 +374,7 @@ class RecurrentGradOp : public RecurrentBase {
auto
&
ex_tensor
=
ex_scope
.
FindVar
(
ex_grad
)
->
Get
<
framework
::
LoDTensor
>
();
VLOG
(
10
0
)
<<
" RNN link "
<<
cur_grad
<<
" from "
<<
ex_grad
;
VLOG
(
10
)
<<
" RNN link "
<<
cur_grad
<<
" from "
<<
ex_grad
;
auto
*
cur_grad_var
=
cur_scope
.
Var
(
cur_grad
);
auto
cur_grad_tensor
=
cur_grad_var
->
GetMutable
<
framework
::
LoDTensor
>
();
...
...
@@ -382,12 +382,12 @@ class RecurrentGradOp : public RecurrentBase {
}
}
VLOG
(
5
0
)
<<
"Recurrent memory linking finished "
;
VLOG
(
5
)
<<
"Recurrent memory linking finished "
;
// Run step block with cur_scope
executor
.
Run
(
*
program
,
&
cur_scope
,
block
->
ID
(),
false
/*create_local_scope*/
);
VLOG
(
5
0
)
<<
"executor.Run finished "
;
VLOG
(
5
)
<<
"executor.Run finished "
;
auto
local_var_names
=
LocalVarNames
(
cur_scope
);
...
...
@@ -436,7 +436,7 @@ class RecurrentGradOp : public RecurrentBase {
cur_scope
.
Rename
(
new_inside_name
,
inside_grad_name
);
}
}
VLOG
(
5
0
)
<<
"Accumulate Parameter finished "
;
VLOG
(
5
)
<<
"Accumulate Parameter finished "
;
// Copy input gradient from inside to outside
// outside::input_grad[seq_offset: seq_offset + 1] = inside::input_grad
...
...
@@ -455,7 +455,7 @@ class RecurrentGradOp : public RecurrentBase {
auto
dst
=
outside
->
Slice
(
seq_offset
,
seq_offset
+
1
);
framework
::
TensorCopy
(
inside
,
place
,
dev_ctx
,
&
dst
);
});
VLOG
(
5
0
)
<<
"Link outside gradient finished "
;
VLOG
(
5
)
<<
"Link outside gradient finished "
;
if
(
step_id
+
1
==
seq_len
)
{
// at_end
// copy initialize states gradient from inside to outside
...
...
@@ -468,7 +468,7 @@ class RecurrentGradOp : public RecurrentBase {
outside
->
mutable_data
(
place
,
inside
.
type
());
framework
::
TensorCopy
(
inside
,
place
,
dev_ctx
,
outside
);
});
VLOG
(
5
0
)
<<
"Link initialize state gradient finished "
;
VLOG
(
5
)
<<
"Link initialize state gradient finished "
;
}
scopes
.
Next
();
}
...
...
paddle/fluid/operators/rnn_memory_helper_op.cc
浏览文件 @
d3e140a5
...
...
@@ -93,7 +93,7 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase {
in_grad_var_name
);
if
(
out_grad_var
==
nullptr
)
{
VLOG
(
5
0
)
<<
"Using fill constant 0 as starting gradient"
;
VLOG
(
5
)
<<
"Using fill constant 0 as starting gradient"
;
auto
in_var_name
=
Input
(
"X"
);
auto
*
in_var
=
scope
.
FindVar
(
in_var_name
);
auto
&
in_var_tensor
=
in_var
->
Get
<
framework
::
LoDTensor
>
();
...
...
paddle/fluid/operators/save_op.cc
浏览文件 @
d3e140a5
...
...
@@ -110,7 +110,7 @@ class SaveOp : public framework::OperatorBase {
lt_var
!=
nullptr
,
"Can not find variable kLookupTablePath for SaveSelectedRows"
);
std
::
string
filename
=
lt_var
->
data
();
VLOG
(
4
0
)
<<
"SaveSelectedRows get File name: "
<<
filename
;
VLOG
(
4
)
<<
"SaveSelectedRows get File name: "
<<
filename
;
MkDirRecursively
(
DirName
(
filename
).
c_str
());
...
...
paddle/fluid/operators/sequence_ops/sequence_mask_op.h
浏览文件 @
d3e140a5
此差异已折叠。
点击以展开。
paddle/fluid/operators/sum_mkldnn_op.cc
浏览文件 @
d3e140a5
此差异已折叠。
点击以展开。
paddle/fluid/operators/sum_op.cc
浏览文件 @
d3e140a5
此差异已折叠。
点击以展开。
paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
浏览文件 @
d3e140a5
此差异已折叠。
点击以展开。
paddle/fluid/platform/device_tracer.cc
浏览文件 @
d3e140a5
此差异已折叠。
点击以展开。
paddle/fluid/platform/dynload/dynamic_loader.cc
浏览文件 @
d3e140a5
此差异已折叠。
点击以展开。
paddle/fluid/platform/gpu_info.cc
浏览文件 @
d3e140a5
此差异已折叠。
点击以展开。
paddle/fluid/platform/init.cc
浏览文件 @
d3e140a5
此差异已折叠。
点击以展开。
paddle/fluid/platform/nccl_helper.h
浏览文件 @
d3e140a5
此差异已折叠。
点击以展开。
paddle/fluid/pybind/protobuf.cc
浏览文件 @
d3e140a5
此差异已折叠。
点击以展开。
paddle/fluid/train/demo/demo_trainer.cc
浏览文件 @
d3e140a5
此差异已折叠。
点击以展开。
paddle/legacy/pserver/ParameterClient2.cpp
浏览文件 @
d3e140a5
此差异已折叠。
点击以展开。
paddle/testing/TestUtil.cpp
浏览文件 @
d3e140a5
此差异已折叠。
点击以展开。
python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
浏览文件 @
d3e140a5
此差异已折叠。
点击以展开。
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录