Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
d231e550
P
Paddle
项目概览
PaddlePaddle
/
Paddle
大约 2 年 前同步成功
通知
2325
Star
20933
Fork
5424
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
d231e550
编写于
11月 12, 2018
作者:
S
sneaxiy
浏览文件
操作
浏览文件
下载
差异文件
merge develop
test=develop
上级
cf8d2e67
080740b3
变更
171
隐藏空白更改
内联
并排
Showing
171 changed file
with
3318 addition
and
912 deletion
+3318
-912
paddle/fluid/API.spec
paddle/fluid/API.spec
+5
-2
paddle/fluid/framework/data_device_transform.cc
paddle/fluid/framework/data_device_transform.cc
+2
-2
paddle/fluid/framework/data_device_transform_test.cu
paddle/fluid/framework/data_device_transform_test.cu
+3
-3
paddle/fluid/framework/details/broadcast_op_handle.cc
paddle/fluid/framework/details/broadcast_op_handle.cc
+1
-1
paddle/fluid/framework/details/modify_op_lock_and_record_event_pass.cc
...framework/details/modify_op_lock_and_record_event_pass.cc
+2
-2
paddle/fluid/framework/details/multi_devices_graph_pass.cc
paddle/fluid/framework/details/multi_devices_graph_pass.cc
+6
-6
paddle/fluid/framework/details/reference_count_pass.cc
paddle/fluid/framework/details/reference_count_pass.cc
+2
-2
paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
+1
-1
paddle/fluid/framework/details/sequential_execution_pass.cc
paddle/fluid/framework/details/sequential_execution_pass.cc
+2
-2
paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
...le/fluid/framework/details/threaded_ssa_graph_executor.cc
+4
-4
paddle/fluid/framework/executor.cc
paddle/fluid/framework/executor.cc
+11
-11
paddle/fluid/framework/feed_fetch_method.cc
paddle/fluid/framework/feed_fetch_method.cc
+3
-3
paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
+14
-14
paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc
paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc
+2
-2
paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
+3
-3
paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc
paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc
+2
-2
paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.cc
paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.cc
+1
-1
paddle/fluid/framework/ir/fc_fuse_pass.cc
paddle/fluid/framework/ir/fc_fuse_pass.cc
+1
-1
paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc
paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc
+14
-14
paddle/fluid/framework/ir/graph.cc
paddle/fluid/framework/ir/graph.cc
+2
-2
paddle/fluid/framework/ir/graph.h
paddle/fluid/framework/ir/graph.h
+1
-1
paddle/fluid/framework/ir/graph_helper.cc
paddle/fluid/framework/ir/graph_helper.cc
+10
-9
paddle/fluid/framework/ir/graph_pattern_detector.cc
paddle/fluid/framework/ir/graph_pattern_detector.cc
+12
-10
paddle/fluid/framework/ir/graph_viz_pass.cc
paddle/fluid/framework/ir/graph_viz_pass.cc
+1
-1
paddle/fluid/framework/ir/mkldnn_placement_pass.cc
paddle/fluid/framework/ir/mkldnn_placement_pass.cc
+1
-1
paddle/fluid/framework/ir/multi_batch_merge_pass.cc
paddle/fluid/framework/ir/multi_batch_merge_pass.cc
+4
-4
paddle/fluid/framework/ir/pass.h
paddle/fluid/framework/ir/pass.h
+1
-1
paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc
paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc
+6
-6
paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc
paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc
+1
-1
paddle/fluid/framework/lod_rank_table.cc
paddle/fluid/framework/lod_rank_table.cc
+1
-1
paddle/fluid/framework/mixed_vector_test.cc
paddle/fluid/framework/mixed_vector_test.cc
+1
-1
paddle/fluid/framework/naive_executor.cc
paddle/fluid/framework/naive_executor.cc
+7
-7
paddle/fluid/framework/op_desc.cc
paddle/fluid/framework/op_desc.cc
+16
-16
paddle/fluid/framework/op_registry.cc
paddle/fluid/framework/op_registry.cc
+3
-3
paddle/fluid/framework/operator.cc
paddle/fluid/framework/operator.cc
+10
-7
paddle/fluid/framework/parallel_executor.cc
paddle/fluid/framework/parallel_executor.cc
+1
-1
paddle/fluid/framework/scope.cc
paddle/fluid/framework/scope.cc
+1
-1
paddle/fluid/framework/selected_rows.cc
paddle/fluid/framework/selected_rows.cc
+1
-1
paddle/fluid/framework/tensor_util.cc
paddle/fluid/framework/tensor_util.cc
+12
-12
paddle/fluid/framework/threadpool.cc
paddle/fluid/framework/threadpool.cc
+1
-1
paddle/fluid/framework/var_desc.cc
paddle/fluid/framework/var_desc.cc
+14
-14
paddle/fluid/framework/var_type_inference.h
paddle/fluid/framework/var_type_inference.h
+25
-0
paddle/fluid/inference/analysis/analyzer.cc
paddle/fluid/inference/analysis/analyzer.cc
+5
-3
paddle/fluid/inference/analysis/argument.h
paddle/fluid/inference/analysis/argument.h
+2
-2
paddle/fluid/inference/analysis/data_flow_graph.cc
paddle/fluid/inference/analysis/data_flow_graph.cc
+5
-5
paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
...fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
+4
-3
paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc
paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc
+1
-1
paddle/fluid/inference/analysis/fluid_to_ir_pass.cc
paddle/fluid/inference/analysis/fluid_to_ir_pass.cc
+1
-1
paddle/fluid/inference/analysis/model_store_pass.cc
paddle/fluid/inference/analysis/model_store_pass.cc
+4
-4
paddle/fluid/inference/analysis/pass_manager.cc
paddle/fluid/inference/analysis/pass_manager.cc
+2
-2
paddle/fluid/inference/analysis/subgraph_splitter.cc
paddle/fluid/inference/analysis/subgraph_splitter.cc
+1
-1
paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc
paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc
+3
-3
paddle/fluid/inference/api/analysis_predictor.cc
paddle/fluid/inference/api/analysis_predictor.cc
+8
-8
paddle/fluid/inference/api/analysis_predictor.h
paddle/fluid/inference/api/analysis_predictor.h
+2
-0
paddle/fluid/inference/api/api.cc
paddle/fluid/inference/api/api.cc
+0
-1
paddle/fluid/inference/api/api_impl.cc
paddle/fluid/inference/api/api_impl.cc
+1
-1
paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
+7
-7
paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc
paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc
+4
-4
paddle/fluid/inference/api/demo_ci/utils.h
paddle/fluid/inference/api/demo_ci/utils.h
+5
-5
paddle/fluid/inference/api/demo_ci/vis_demo.cc
paddle/fluid/inference/api/demo_ci/vis_demo.cc
+5
-5
paddle/fluid/inference/api/details/reset_tensor_array.cc
paddle/fluid/inference/api/details/reset_tensor_array.cc
+2
-2
paddle/fluid/inference/api/helper.h
paddle/fluid/inference/api/helper.h
+2
-1
paddle/fluid/inference/io.cc
paddle/fluid/inference/io.cc
+4
-3
paddle/fluid/inference/tensorrt/convert/concat_op.cc
paddle/fluid/inference/tensorrt/convert/concat_op.cc
+1
-1
paddle/fluid/inference/tensorrt/convert/dropout_op.cc
paddle/fluid/inference/tensorrt/convert/dropout_op.cc
+1
-1
paddle/fluid/inference/tensorrt/convert/fc_op.cc
paddle/fluid/inference/tensorrt/convert/fc_op.cc
+1
-1
paddle/fluid/inference/tensorrt/convert/mul_op.cc
paddle/fluid/inference/tensorrt/convert/mul_op.cc
+1
-1
paddle/fluid/inference/tensorrt/convert/pad_op.cc
paddle/fluid/inference/tensorrt/convert/pad_op.cc
+1
-1
paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
+1
-1
paddle/fluid/inference/tensorrt/convert/softmax_op.cc
paddle/fluid/inference/tensorrt/convert/softmax_op.cc
+1
-1
paddle/fluid/inference/tensorrt/engine.h
paddle/fluid/inference/tensorrt/engine.h
+1
-1
paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
+3
-3
paddle/fluid/memory/detail/buddy_allocator.cc
paddle/fluid/memory/detail/buddy_allocator.cc
+28
-28
paddle/fluid/memory/detail/meta_cache.cc
paddle/fluid/memory/detail/meta_cache.cc
+1
-1
paddle/fluid/memory/malloc.cc
paddle/fluid/memory/malloc.cc
+7
-7
paddle/fluid/operators/CMakeLists.txt
paddle/fluid/operators/CMakeLists.txt
+1
-0
paddle/fluid/operators/activation_op.cc
paddle/fluid/operators/activation_op.cc
+6
-10
paddle/fluid/operators/activation_op.h
paddle/fluid/operators/activation_op.h
+1
-1
paddle/fluid/operators/adam_op.h
paddle/fluid/operators/adam_op.h
+1
-1
paddle/fluid/operators/add_position_encoding_op.h
paddle/fluid/operators/add_position_encoding_op.h
+4
-3
paddle/fluid/operators/array_operator.h
paddle/fluid/operators/array_operator.h
+1
-1
paddle/fluid/operators/array_to_lod_tensor_op.cc
paddle/fluid/operators/array_to_lod_tensor_op.cc
+2
-2
paddle/fluid/operators/batch_norm_op.cc
paddle/fluid/operators/batch_norm_op.cc
+10
-1
paddle/fluid/operators/batch_norm_op.cu.cc
paddle/fluid/operators/batch_norm_op.cu.cc
+1
-1
paddle/fluid/operators/beam_search_op.cc
paddle/fluid/operators/beam_search_op.cc
+6
-6
paddle/fluid/operators/bilinear_interp_op.h
paddle/fluid/operators/bilinear_interp_op.h
+0
-163
paddle/fluid/operators/checkpoint_notify_op.cc
paddle/fluid/operators/checkpoint_notify_op.cc
+2
-2
paddle/fluid/operators/concat_op.cc
paddle/fluid/operators/concat_op.cc
+1
-1
paddle/fluid/operators/conv_cudnn_op.cu.cc
paddle/fluid/operators/conv_cudnn_op.cu.cc
+188
-23
paddle/fluid/operators/conv_cudnn_op_cache.h
paddle/fluid/operators/conv_cudnn_op_cache.h
+90
-0
paddle/fluid/operators/conv_op.cc
paddle/fluid/operators/conv_op.cc
+22
-1
paddle/fluid/operators/cross_entropy_op.cc
paddle/fluid/operators/cross_entropy_op.cc
+11
-0
paddle/fluid/operators/distributed/brpc_server.cc
paddle/fluid/operators/distributed/brpc_server.cc
+2
-2
paddle/fluid/operators/distributed/grpc_client.cc
paddle/fluid/operators/distributed/grpc_client.cc
+7
-7
paddle/fluid/operators/distributed/grpc_server.cc
paddle/fluid/operators/distributed/grpc_server.cc
+23
-22
paddle/fluid/operators/distributed/request_handler.h
paddle/fluid/operators/distributed/request_handler.h
+2
-2
paddle/fluid/operators/distributed/request_handler_impl.cc
paddle/fluid/operators/distributed/request_handler_impl.cc
+13
-12
paddle/fluid/operators/distributed/rpc_server.cc
paddle/fluid/operators/distributed/rpc_server.cc
+10
-10
paddle/fluid/operators/distributed/variable_response.cc
paddle/fluid/operators/distributed/variable_response.cc
+4
-4
paddle/fluid/operators/elementwise_op.h
paddle/fluid/operators/elementwise_op.h
+6
-10
paddle/fluid/operators/feed_op.cc
paddle/fluid/operators/feed_op.cc
+2
-2
paddle/fluid/operators/fetch_barrier_op.cc
paddle/fluid/operators/fetch_barrier_op.cc
+1
-1
paddle/fluid/operators/fetch_op.cc
paddle/fluid/operators/fetch_op.cc
+1
-1
paddle/fluid/operators/gen_nccl_id_op.cc
paddle/fluid/operators/gen_nccl_id_op.cc
+5
-5
paddle/fluid/operators/grid_sampler_op.h
paddle/fluid/operators/grid_sampler_op.h
+9
-2
paddle/fluid/operators/interpolate_op.cc
paddle/fluid/operators/interpolate_op.cc
+53
-27
paddle/fluid/operators/interpolate_op.cu
paddle/fluid/operators/interpolate_op.cu
+292
-0
paddle/fluid/operators/interpolate_op.h
paddle/fluid/operators/interpolate_op.h
+236
-0
paddle/fluid/operators/listen_and_serv_op.cc
paddle/fluid/operators/listen_and_serv_op.cc
+17
-17
paddle/fluid/operators/lod_rank_table_op.cc
paddle/fluid/operators/lod_rank_table_op.cc
+2
-2
paddle/fluid/operators/lookup_table_op.cc
paddle/fluid/operators/lookup_table_op.cc
+4
-4
paddle/fluid/operators/math/cpu_vec_test.cc
paddle/fluid/operators/math/cpu_vec_test.cc
+2
-2
paddle/fluid/operators/math/jit_kernel_test.cc
paddle/fluid/operators/math/jit_kernel_test.cc
+50
-39
paddle/fluid/operators/math/selected_rows_functor.cc
paddle/fluid/operators/math/selected_rows_functor.cc
+2
-2
paddle/fluid/operators/math/selected_rows_functor.cu
paddle/fluid/operators/math/selected_rows_functor.cu
+2
-2
paddle/fluid/operators/mean_op.cc
paddle/fluid/operators/mean_op.cc
+19
-2
paddle/fluid/operators/momentum_op.h
paddle/fluid/operators/momentum_op.h
+1
-1
paddle/fluid/operators/mul_op.cc
paddle/fluid/operators/mul_op.cc
+13
-4
paddle/fluid/operators/nccl_op.cu.cc
paddle/fluid/operators/nccl_op.cu.cc
+16
-15
paddle/fluid/operators/nccl_op_test.cu.cc
paddle/fluid/operators/nccl_op_test.cu.cc
+7
-7
paddle/fluid/operators/parallel_do_op.cc
paddle/fluid/operators/parallel_do_op.cc
+5
-5
paddle/fluid/operators/pool_op.cc
paddle/fluid/operators/pool_op.cc
+14
-4
paddle/fluid/operators/prefetch_op.cc
paddle/fluid/operators/prefetch_op.cc
+3
-3
paddle/fluid/operators/random_crop_op.h
paddle/fluid/operators/random_crop_op.h
+2
-2
paddle/fluid/operators/reader/blocking_queue.h
paddle/fluid/operators/reader/blocking_queue.h
+2
-2
paddle/fluid/operators/reader/create_shuffle_reader_op.cc
paddle/fluid/operators/reader/create_shuffle_reader_op.cc
+3
-3
paddle/fluid/operators/recurrent_op.cc
paddle/fluid/operators/recurrent_op.cc
+13
-13
paddle/fluid/operators/recv_op.cc
paddle/fluid/operators/recv_op.cc
+1
-1
paddle/fluid/operators/rnn_memory_helper_op.cc
paddle/fluid/operators/rnn_memory_helper_op.cc
+1
-1
paddle/fluid/operators/save_op.cc
paddle/fluid/operators/save_op.cc
+1
-1
paddle/fluid/operators/send_barrier_op.cc
paddle/fluid/operators/send_barrier_op.cc
+2
-2
paddle/fluid/operators/send_op.cc
paddle/fluid/operators/send_op.cc
+2
-2
paddle/fluid/operators/send_recv_op_test.cc
paddle/fluid/operators/send_recv_op_test.cc
+2
-2
paddle/fluid/operators/sequence_mask_op.h
paddle/fluid/operators/sequence_mask_op.h
+1
-1
paddle/fluid/operators/sgd_op.h
paddle/fluid/operators/sgd_op.h
+4
-4
paddle/fluid/operators/similarity_focus_op.cc
paddle/fluid/operators/similarity_focus_op.cc
+87
-0
paddle/fluid/operators/similarity_focus_op.h
paddle/fluid/operators/similarity_focus_op.h
+168
-0
paddle/fluid/operators/softmax_op.cc
paddle/fluid/operators/softmax_op.cc
+9
-1
paddle/fluid/operators/split_byref_op.h
paddle/fluid/operators/split_byref_op.h
+1
-1
paddle/fluid/operators/split_ids_op.h
paddle/fluid/operators/split_ids_op.h
+1
-1
paddle/fluid/operators/sum_mkldnn_op.cc
paddle/fluid/operators/sum_mkldnn_op.cc
+1
-1
paddle/fluid/operators/sum_op.cc
paddle/fluid/operators/sum_op.cc
+3
-3
paddle/fluid/operators/tensor_array_read_write_op.cc
paddle/fluid/operators/tensor_array_read_write_op.cc
+7
-7
paddle/fluid/operators/tensor_array_to_tensor_op.cc
paddle/fluid/operators/tensor_array_to_tensor_op.cc
+246
-0
paddle/fluid/operators/tensorrt_engine_op.h
paddle/fluid/operators/tensorrt_engine_op.h
+8
-8
paddle/fluid/operators/while_op.cc
paddle/fluid/operators/while_op.cc
+9
-9
paddle/fluid/platform/device_context.cc
paddle/fluid/platform/device_context.cc
+4
-1
paddle/fluid/platform/device_tracer.cc
paddle/fluid/platform/device_tracer.cc
+4
-4
paddle/fluid/platform/dynload/cudnn.h
paddle/fluid/platform/dynload/cudnn.h
+48
-45
paddle/fluid/platform/dynload/dynamic_loader.cc
paddle/fluid/platform/dynload/dynamic_loader.cc
+2
-2
paddle/fluid/platform/gpu_info.cc
paddle/fluid/platform/gpu_info.cc
+2
-2
paddle/fluid/platform/init.cc
paddle/fluid/platform/init.cc
+1
-1
paddle/fluid/platform/nccl_helper.h
paddle/fluid/platform/nccl_helper.h
+1
-1
paddle/fluid/pybind/protobuf.cc
paddle/fluid/pybind/protobuf.cc
+3
-3
paddle/fluid/train/demo/demo_trainer.cc
paddle/fluid/train/demo/demo_trainer.cc
+1
-1
paddle/scripts/paddle_build.sh
paddle/scripts/paddle_build.sh
+22
-3
paddle/testing/TestUtil.cpp
paddle/testing/TestUtil.cpp
+1
-1
python/paddle/fluid/__init__.py
python/paddle/fluid/__init__.py
+3
-1
python/paddle/fluid/distribute_lookup_table.py
python/paddle/fluid/distribute_lookup_table.py
+39
-0
python/paddle/fluid/layers/nn.py
python/paddle/fluid/layers/nn.py
+265
-22
python/paddle/fluid/layers/tensor.py
python/paddle/fluid/layers/tensor.py
+58
-4
python/paddle/fluid/optimizer.py
python/paddle/fluid/optimizer.py
+59
-7
python/paddle/fluid/tests/book/test_label_semantic_roles.py
python/paddle/fluid/tests/book/test_label_semantic_roles.py
+1
-1
python/paddle/fluid/tests/unittests/test_conv2d_op.py
python/paddle/fluid/tests/unittests/test_conv2d_op.py
+9
-1
python/paddle/fluid/tests/unittests/test_conv3d_op.py
python/paddle/fluid/tests/unittests/test_conv3d_op.py
+6
-0
python/paddle/fluid/tests/unittests/test_dist_transpiler.py
python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+11
-2
python/paddle/fluid/tests/unittests/test_interpolate_op.py
python/paddle/fluid/tests/unittests/test_interpolate_op.py
+335
-0
python/paddle/fluid/tests/unittests/test_layers.py
python/paddle/fluid/tests/unittests/test_layers.py
+10
-0
python/paddle/fluid/tests/unittests/test_similarity_focus_op.py
.../paddle/fluid/tests/unittests/test_similarity_focus_op.py
+217
-0
python/paddle/fluid/tests/unittests/test_tensor_array_to_tensor.py
...ddle/fluid/tests/unittests/test_tensor_array_to_tensor.py
+142
-0
python/paddle/fluid/transpiler/distribute_transpiler.py
python/paddle/fluid/transpiler/distribute_transpiler.py
+4
-27
未找到文件。
paddle/fluid/API.spec
浏览文件 @
d231e550
...
...
@@ -118,9 +118,10 @@ paddle.fluid.layers.label_smooth ArgSpec(args=['label', 'prior_dist', 'epsilon',
paddle.fluid.layers.roi_pool ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1, 1, 1.0))
paddle.fluid.layers.roi_align ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale', 'sampling_ratio', 'name'], varargs=None, keywords=None, defaults=(1, 1, 1.0, -1, None))
paddle.fluid.layers.dice_loss ArgSpec(args=['input', 'label', 'epsilon'], varargs=None, keywords=None, defaults=(1e-05,))
paddle.fluid.layers.image_resize ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample'
], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR'
))
paddle.fluid.layers.image_resize ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample'
, 'actual_shape'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR', None
))
paddle.fluid.layers.image_resize_short ArgSpec(args=['input', 'out_short_len', 'resample'], varargs=None, keywords=None, defaults=('BILINEAR',))
paddle.fluid.layers.resize_bilinear ArgSpec(args=['input', 'out_shape', 'scale', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.resize_bilinear ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.layers.resize_nearest ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.layers.gather ArgSpec(args=['input', 'index'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.scatter ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.sequence_scatter ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,))
...
...
@@ -178,6 +179,7 @@ paddle.fluid.layers.space_to_depth ArgSpec(args=['x', 'blocksize', 'name'], vara
paddle.fluid.layers.affine_grid ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.sequence_reverse ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.affine_channel ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None))
paddle.fluid.layers.similarity_focus ArgSpec(args=['input', 'axis', 'indexes', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.hash ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None))
paddle.fluid.layers.grid_sampler ArgSpec(args=['x', 'grid', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None))
...
...
@@ -200,6 +202,7 @@ paddle.fluid.layers.create_tensor ArgSpec(args=['dtype', 'name', 'persistable'],
paddle.fluid.layers.create_parameter ArgSpec(args=['shape', 'dtype', 'name', 'attr', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(None, None, False, None))
paddle.fluid.layers.create_global_var ArgSpec(args=['shape', 'value', 'dtype', 'persistable', 'force_cpu', 'name'], varargs=None, keywords=None, defaults=(False, False, None))
paddle.fluid.layers.cast ArgSpec(args=['x', 'dtype'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.tensor_array_to_tensor ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None))
paddle.fluid.layers.concat ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(0, None))
paddle.fluid.layers.sums ArgSpec(args=['input', 'out'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.assign ArgSpec(args=['input', 'output'], varargs=None, keywords=None, defaults=(None,))
...
...
paddle/fluid/framework/data_device_transform.cc
浏览文件 @
d231e550
...
...
@@ -18,8 +18,8 @@ namespace framework {
void
TransDataDevice
(
const
Tensor
&
in
,
const
platform
::
Place
&
dst_place
,
Tensor
*
out
)
{
VLOG
(
3
)
<<
"DeviceTransform in, src_place "
<<
in
.
place
()
<<
" dst_place: "
<<
dst_place
;
VLOG
(
3
0
)
<<
"DeviceTransform in, src_place "
<<
in
.
place
()
<<
" dst_place: "
<<
dst_place
;
PADDLE_ENFORCE_NE
(
in
.
place
().
which
(),
dst_place
.
which
(),
...
...
paddle/fluid/framework/data_device_transform_test.cu
浏览文件 @
d231e550
...
...
@@ -49,10 +49,10 @@ class TestOpWithKernel : public OperatorWithKernel {
OpKernelType
GetExpectedKernelType
(
const
ExecutionContext
&
ctx
)
const
override
{
if
(
Attr
<
bool
>
(
"use_gpu"
))
{
VLOG
(
3
)
<<
"force use gpu kernel"
;
VLOG
(
3
0
)
<<
"force use gpu kernel"
;
return
OpKernelType
(
proto
::
VarType
::
FP32
,
platform
::
CUDAPlace
(
0
));
}
else
{
VLOG
(
3
)
<<
"use default kernel"
;
VLOG
(
3
0
)
<<
"use default kernel"
;
return
OpKernelType
(
proto
::
VarType
::
FP32
,
ctx
.
Input
<
Tensor
>
(
"input"
)
->
place
());
}
...
...
@@ -148,7 +148,7 @@ TEST(Operator, CPUtoGPU) {
// get output
auto
*
output2
=
scope
.
Var
(
"OUT2"
);
gpu_op
->
Run
(
scope
,
cuda_place
);
VLOG
(
3
)
<<
"after gpu_op run"
;
VLOG
(
3
0
)
<<
"after gpu_op run"
;
// auto* output2_ptr = output2->Get<LoDTensor>().data<float>();
paddle
::
platform
::
DeviceContextPool
&
pool
=
...
...
paddle/fluid/framework/details/broadcast_op_handle.cc
浏览文件 @
d231e550
...
...
@@ -60,7 +60,7 @@ void BroadcastOpHandle::BroadcastOneVar(
PADDLE_ENFORCE_NOT_NULL
(
in_var
);
Tensor
&
in_tensor
=
VariableVisitor
::
GetMutableTensor
(
in_var
);
if
(
UNLIKELY
(
!
in_tensor
.
IsInitialized
()))
{
VLOG
(
3
)
<<
"in var "
<<
in_var_handle
.
name_
<<
"not inited, return!"
;
VLOG
(
3
0
)
<<
"in var "
<<
in_var_handle
.
name_
<<
"not inited, return!"
;
return
;
}
...
...
paddle/fluid/framework/details/modify_op_lock_and_record_event_pass.cc
浏览文件 @
d231e550
...
...
@@ -45,8 +45,8 @@ std::unique_ptr<ir::Graph> ModifyOpLockAndRecordEventPass::ApplyImpl(
IsLockAndRecordEventFreeComputationOpHandle
(
compute_op
,
graph_view
);
compute_op
->
SetLockAndRecordEventFree
(
is_lock_and_record_event_free
);
if
(
is_lock_and_record_event_free
)
{
VLOG
(
10
)
<<
"Set is_lock_and_record_event_free be true in op "
<<
compute_op
->
DebugString
();
VLOG
(
10
0
)
<<
"Set is_lock_and_record_event_free be true in op "
<<
compute_op
->
DebugString
();
}
}
return
ir_graph
;
...
...
paddle/fluid/framework/details/multi_devices_graph_pass.cc
浏览文件 @
d231e550
...
...
@@ -399,7 +399,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
for
(
size_t
i
=
0
;
i
<
backward_vars
.
size
();
i
+=
2
)
{
auto
&
p_name
=
backward_vars
[
i
];
auto
&
g_name
=
backward_vars
[
i
+
1
];
VLOG
(
10
)
<<
"Bcast "
<<
g_name
<<
" for parameter "
<<
p_name
;
VLOG
(
10
0
)
<<
"Bcast "
<<
g_name
<<
" for parameter "
<<
p_name
;
switch
(
strategy_
.
reduce_
)
{
case
BuildStrategy
::
ReduceStrategy
::
kReduce
:
...
...
@@ -809,8 +809,8 @@ int MultiDevSSAGraphBuilder::CreateRPCOp(
node
->
Op
()
->
GetAttr
(
OpProtoAndCheckerMaker
::
OpRoleVarAttrName
()));
PADDLE_ENFORCE_EQ
(
send_param_grad
.
size
(),
2U
);
op_dev_id
=
GetAppropriateDeviceID
({
send_param_grad
[
1
]});
VLOG
(
10
)
<<
"send grad "
<<
input_var_names
[
0
]
<<
" origin "
<<
send_param_grad
[
1
]
<<
" place: "
<<
op_dev_id
;
VLOG
(
10
0
)
<<
"send grad "
<<
input_var_names
[
0
]
<<
" origin "
<<
send_param_grad
[
1
]
<<
" place: "
<<
op_dev_id
;
for
(
auto
&
varname
:
input_var_names
)
{
sharded_var_device
->
emplace
(
varname
,
op_dev_id
);
}
...
...
@@ -826,9 +826,9 @@ int MultiDevSSAGraphBuilder::CreateRPCOp(
if
(
recv_param_grad
.
size
()
==
2U
)
{
op_dev_id
=
GetVarDeviceID
(
*
result
,
recv_param_grad
[
1
],
*
sharded_var_device
);
VLOG
(
10
)
<<
"recv param "
<<
recv_param_grad
[
0
]
<<
" get grad place: "
<<
recv_param_grad
[
1
]
<<
" place: "
<<
op_dev_id
;
VLOG
(
10
0
)
<<
"recv param "
<<
recv_param_grad
[
0
]
<<
" get grad place: "
<<
recv_param_grad
[
1
]
<<
" place: "
<<
op_dev_id
;
}
else
{
op_dev_id
=
GetAppropriateDeviceID
(
output_var_names
);
}
...
...
paddle/fluid/framework/details/reference_count_pass.cc
浏览文件 @
d231e550
...
...
@@ -140,8 +140,8 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
if
(
next_compute_op
!=
nullptr
)
{
if
(
compute_ref_cnt_map
.
count
(
next_compute_op
))
{
compute_ref_cnt_map
[
next_compute_op
]
->
AddVar
(
var_name
);
VLOG
(
5
)
<<
"Add reference count of "
<<
var_name
<<
" to Operator "
<<
next_compute_op
->
Name
();
VLOG
(
5
0
)
<<
"Add reference count of "
<<
var_name
<<
" to Operator "
<<
next_compute_op
->
Name
();
}
else
{
// Create new reference_count_op_handle
ir
::
Node
*
ref_cnt_node
=
graph
->
CreateEmptyNode
(
...
...
paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
浏览文件 @
d231e550
...
...
@@ -51,7 +51,7 @@ void ScaleLossGradOpHandle::RunImpl() {
->
stream
();
memory
::
Copy
(
boost
::
get
<
platform
::
CUDAPlace
>
(
place_
),
tmp
,
platform
::
CPUPlace
(),
&
coeff_
,
sizeof
(
float
),
stream
);
VLOG
(
10
)
<<
place_
<<
"RUN Scale loss grad op"
;
VLOG
(
10
0
)
<<
place_
<<
"RUN Scale loss grad op"
;
});
#endif
}
...
...
paddle/fluid/framework/details/sequential_execution_pass.cc
浏览文件 @
d231e550
...
...
@@ -94,8 +94,8 @@ std::unique_ptr<ir::Graph> SequentialExecutionPass::ApplyImpl(
op_node_list
[
i
-
1
]
->
outputs
.
push_back
(
dep_var
);
dep_var
->
outputs
.
push_back
(
op_node_list
[
i
]);
dep_var
->
inputs
.
push_back
(
op_node_list
[
i
-
1
]);
VLOG
(
10
)
<<
"Add dependencies between "
<<
op_node_list
[
i
-
1
]
->
Name
()
<<
" and "
<<
op_node_list
[
i
]
->
Name
();
VLOG
(
10
0
)
<<
"Add dependencies between "
<<
op_node_list
[
i
-
1
]
->
Name
()
<<
" and "
<<
op_node_list
[
i
]
->
Name
();
}
return
graph
;
}
...
...
paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
浏览文件 @
d231e550
...
...
@@ -210,16 +210,16 @@ void ThreadedSSAGraphExecutor::RunOp(
details
::
OpHandleBase
*
op
)
{
auto
op_run
=
[
ready_var_q
,
op
,
this
]
{
try
{
if
(
VLOG_IS_ON
(
10
))
{
VLOG
(
10
)
<<
op
<<
" "
<<
op
->
Name
()
<<
" : "
<<
op
->
DebugString
();
if
(
VLOG_IS_ON
(
10
0
))
{
VLOG
(
10
0
)
<<
op
<<
" "
<<
op
->
Name
()
<<
" : "
<<
op
->
DebugString
();
}
if
(
LIKELY
(
!
strategy_
.
dry_run_
))
{
op
->
Run
(
strategy_
.
use_cuda_
);
}
VLOG
(
10
)
<<
op
<<
" "
<<
op
->
Name
()
<<
" Done "
;
VLOG
(
10
0
)
<<
op
<<
" "
<<
op
->
Name
()
<<
" Done "
;
running_ops_
--
;
ready_var_q
->
Extend
(
op
->
Outputs
());
VLOG
(
10
)
<<
op
<<
" "
<<
op
->
Name
()
<<
"Signal posted"
;
VLOG
(
10
0
)
<<
op
<<
" "
<<
op
->
Name
()
<<
"Signal posted"
;
}
catch
(...)
{
exception_holder_
.
Catch
(
std
::
current_exception
());
}
...
...
paddle/fluid/framework/executor.cc
浏览文件 @
d231e550
...
...
@@ -43,7 +43,7 @@ ExecutorPrepareContext::ExecutorPrepareContext(
}
ExecutorPrepareContext
::~
ExecutorPrepareContext
()
{
VLOG
(
5
)
<<
"destroy ExecutorPrepareContext"
;
VLOG
(
5
0
)
<<
"destroy ExecutorPrepareContext"
;
}
template
<
typename
RefCntMap
>
...
...
@@ -60,7 +60,7 @@ static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op,
if
((
it
->
second
)
--
==
1
)
{
auto
*
var
=
scope
.
FindVar
(
name
);
if
(
var
!=
nullptr
)
{
VLOG
(
10
)
<<
"Erase tensor
\'
"
<<
name
<<
"
\'
"
;
VLOG
(
10
0
)
<<
"Erase tensor
\'
"
<<
name
<<
"
\'
"
;
if
(
var
->
IsType
<
LoDTensor
>
())
{
erase_tensors
.
insert
(
var
->
GetMutable
<
LoDTensor
>
());
}
else
if
(
var
->
IsType
<
SelectedRows
>
())
{
...
...
@@ -141,21 +141,21 @@ void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope,
if
(
var
->
Persistable
())
{
auto
*
ptr
=
const_cast
<
Scope
*>
(
ancestor_scope
)
->
Var
(
var
->
Name
());
InitializeVariable
(
ptr
,
var
->
GetType
());
VLOG
(
3
)
<<
"Create Variable "
<<
var
->
Name
()
<<
" global, which pointer is "
<<
ptr
;
VLOG
(
3
0
)
<<
"Create Variable "
<<
var
->
Name
()
<<
" global, which pointer is "
<<
ptr
;
}
else
{
auto
*
ptr
=
scope
->
Var
(
var
->
Name
());
InitializeVariable
(
ptr
,
var
->
GetType
());
VLOG
(
3
)
<<
"Create Variable "
<<
var
->
Name
()
<<
" locally, which pointer is "
<<
ptr
;
VLOG
(
3
0
)
<<
"Create Variable "
<<
var
->
Name
()
<<
" locally, which pointer is "
<<
ptr
;
}
}
}
else
{
for
(
auto
&
var
:
global_block
.
AllVars
())
{
auto
*
ptr
=
scope
->
Var
(
var
->
Name
());
InitializeVariable
(
ptr
,
var
->
GetType
());
VLOG
(
3
)
<<
"Create variable "
<<
var
->
Name
()
<<
", which pointer is "
<<
ptr
;
VLOG
(
3
0
)
<<
"Create variable "
<<
var
->
Name
()
<<
", which pointer is "
<<
ptr
;
}
}
}
...
...
@@ -286,7 +286,7 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
int
i
=
0
;
for
(
auto
&
feed_target
:
(
*
feed_targets
))
{
std
::
string
var_name
=
feed_target
.
first
;
VLOG
(
3
)
<<
"feed target's name: "
<<
var_name
;
VLOG
(
3
0
)
<<
"feed target's name: "
<<
var_name
;
// prepend feed op
auto
*
op
=
global_block
->
PrependOp
();
...
...
@@ -309,7 +309,7 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
int
i
=
0
;
for
(
auto
&
fetch_target
:
(
*
fetch_targets
))
{
std
::
string
var_name
=
fetch_target
.
first
;
VLOG
(
3
)
<<
"fetch target's name: "
<<
var_name
;
VLOG
(
3
0
)
<<
"fetch target's name: "
<<
var_name
;
// append fetch op
auto
*
op
=
global_block
->
AppendOp
();
...
...
@@ -459,7 +459,7 @@ void Executor::RunPreparedContext(
void
Executor
::
EnableMKLDNN
(
const
ProgramDesc
&
program
)
{
#ifdef PADDLE_WITH_MKLDNN
VLOG
(
3
)
<<
"use_mkldnn=True"
;
VLOG
(
3
0
)
<<
"use_mkldnn=True"
;
for
(
size_t
bid
=
0
;
bid
<
program
.
Size
();
++
bid
)
{
auto
*
block
=
const_cast
<
ProgramDesc
&>
(
program
).
MutableBlock
(
bid
);
for
(
auto
*
op
:
block
->
AllOps
())
{
...
...
paddle/fluid/framework/feed_fetch_method.cc
浏览文件 @
d231e550
...
...
@@ -25,7 +25,7 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input,
const
std
::
string
&
var_name
,
size_t
index
)
{
// If var_name Variable is not found in GlobalScope, a new variable will
// be created.
VLOG
(
3
)
<<
"SetFeedVariable name="
<<
var_name
<<
" index="
<<
index
;
VLOG
(
3
0
)
<<
"SetFeedVariable name="
<<
var_name
<<
" index="
<<
index
;
Variable
*
g_feed_value
=
scope
->
Var
(
var_name
);
auto
&
feed_inputs
=
*
(
g_feed_value
->
GetMutable
<
FeedFetchList
>
());
if
(
index
>=
feed_inputs
.
size
())
{
...
...
@@ -47,8 +47,8 @@ LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name,
typeid
(
FeedFetchList
).
name
());
auto
&
fetch_outputs
=
*
g_fetch_value
->
GetMutable
<
FeedFetchList
>
();
auto
&
tensor
=
fetch_outputs
[
index
];
VLOG
(
3
)
<<
"Fetch "
<<
var_name
<<
" with index "
<<
index
<<
" shape= "
<<
tensor
.
dims
();
VLOG
(
3
0
)
<<
"Fetch "
<<
var_name
<<
" with index "
<<
index
<<
" shape= "
<<
tensor
.
dims
();
PADDLE_ENFORCE_LT
(
index
,
fetch_outputs
.
size
());
return
tensor
;
}
...
...
paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
浏览文件 @
d231e550
...
...
@@ -147,19 +147,19 @@ void PrepareParameters(Graph* graph, const Param& param) {
scope
->
Var
(
param
.
LSTMX
)
->
GetMutable
<
LoDTensor
>
();
scope
->
Var
(
param
.
LSTMOUT
)
->
GetMutable
<
LoDTensor
>
();
#define GATE_W(name__) \
auto* W_##name__##_w0 = scope->FindVar(#name__ ".w_0"); \
auto* W_##name__##_w1 = scope->FindVar(#name__ ".w_1"); \
auto* W_##name__##_b0 = scope->FindVar(#name__ ".b_0"); \
CHECK_P3(W_##name__##_w0, W_##name__##_w1, W_##name__##_b0); \
VLOG(4) << #name__ "_w0" \
<< " shape: " << W_##name__##_w0->Get<LoDTensor>().dims(); \
VLOG(4) << #name__ "_w1" \
<< " shape: " << W_##name__##_w1->Get<LoDTensor>().dims(); \
VLOG(4) << #name__ "_b0" \
<< " shape: " << W_##name__##_b0->Get<LoDTensor>().dims(); \
auto& W_##name__##_w0_t = W_##name__##_w0->Get<LoDTensor>(); \
auto& W_##name__##_w1_t = W_##name__##_w1->Get<LoDTensor>(); \
#define GATE_W(name__)
\
auto* W_##name__##_w0 = scope->FindVar(#name__ ".w_0");
\
auto* W_##name__##_w1 = scope->FindVar(#name__ ".w_1");
\
auto* W_##name__##_b0 = scope->FindVar(#name__ ".b_0");
\
CHECK_P3(W_##name__##_w0, W_##name__##_w1, W_##name__##_b0);
\
VLOG(4
0
) << #name__ "_w0" \
<< " shape: " << W_##name__##_w0->Get<LoDTensor>().dims(); \
VLOG(4
0
) << #name__ "_w1" \
<< " shape: " << W_##name__##_w1->Get<LoDTensor>().dims(); \
VLOG(4
0
) << #name__ "_b0" \
<< " shape: " << W_##name__##_b0->Get<LoDTensor>().dims(); \
auto& W_##name__##_w0_t = W_##name__##_w0->Get<LoDTensor>();
\
auto& W_##name__##_w1_t = W_##name__##_w1->Get<LoDTensor>();
\
auto& W_##name__##_b0_t = W_##name__##_b0->Get<LoDTensor>();
GATE_W
(
forget
);
...
...
@@ -208,7 +208,7 @@ void PrepareLSTMWeight(const LoDTensor& W_forget_w0,
int
D
=
W_forget_w0
.
dims
()[
0
];
int
M
=
W_forget_w1
.
dims
()[
0
];
out
->
Resize
(
make_ddim
({
D
+
M
,
4
*
D
}));
VLOG
(
3
)
<<
"LSTMWeight resized to "
<<
out
->
dims
();
VLOG
(
3
0
)
<<
"LSTMWeight resized to "
<<
out
->
dims
();
float
*
out_data
=
out
->
mutable_data
<
float
>
(
platform
::
CPUPlace
());
std
::
array
<
const
float
*
,
4
>
tensors
(
...
...
paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc
浏览文件 @
d231e550
...
...
@@ -57,7 +57,7 @@ std::unique_ptr<ir::Graph> ConvBiasFusePass::ApplyImpl(
int
found_conv_bias_count
=
0
;
auto
handler
=
[
&
](
const
GraphPatternDetector
::
subgraph_t
&
subgraph
,
Graph
*
g
)
{
VLOG
(
4
)
<<
"handle ConvBias fuse"
;
VLOG
(
4
0
)
<<
"handle ConvBias fuse"
;
GET_IR_NODE_FROM_SUBGRAPH
(
conv_weight
,
conv_weight
,
conv_bias_pattern
);
// Filter
GET_IR_NODE_FROM_SUBGRAPH
(
conv_out
,
conv_out
,
conv_bias_pattern
);
// tmp
...
...
@@ -74,7 +74,7 @@ std::unique_ptr<ir::Graph> ConvBiasFusePass::ApplyImpl(
// check if fuse can be done and if MKL-DNN should be used
FuseOptions
fuse_option
=
FindFuseOption
(
*
conv
,
*
eltwise
);
if
(
fuse_option
==
DO_NOT_FUSE
||
fuse_option
==
FUSE_NATIVE
)
{
VLOG
(
3
)
<<
"do not perform conv+bias fuse"
;
VLOG
(
3
0
)
<<
"do not perform conv+bias fuse"
;
return
;
}
...
...
paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
浏览文件 @
d231e550
...
...
@@ -121,7 +121,7 @@ std::unique_ptr<ir::Graph> ConvBNFusePass::ApplyImpl(
int
found_conv_bn_count
=
0
;
auto
handler
=
[
&
](
const
GraphPatternDetector
::
subgraph_t
&
subgraph
,
Graph
*
g
)
{
VLOG
(
4
)
<<
"handle ConvBN fuse"
;
VLOG
(
4
0
)
<<
"handle ConvBN fuse"
;
// conv, batch_norm,
// conv_weight, conv_out,
...
...
@@ -133,7 +133,7 @@ std::unique_ptr<ir::Graph> ConvBNFusePass::ApplyImpl(
// check if fuse can be done and if MKL-DNN should be used
FuseOptions
fuse_option
=
FindFuseOption
(
*
conv
,
*
batch_norm
);
if
(
fuse_option
==
DO_NOT_FUSE
)
{
VLOG
(
3
)
<<
"do not perform conv+bn fuse"
;
VLOG
(
3
0
)
<<
"do not perform conv+bn fuse"
;
return
;
}
...
...
@@ -241,7 +241,7 @@ std::unique_ptr<ir::Graph> ConvEltwiseAddBNFusePass::ApplyImpl(
int
found_conv_bn_count
=
0
;
auto
handler
=
[
&
](
const
GraphPatternDetector
::
subgraph_t
&
subgraph
,
Graph
*
g
)
{
VLOG
(
4
)
<<
"handle ConvBN fuse"
;
VLOG
(
4
0
)
<<
"handle ConvBN fuse"
;
// conv, batch_norm,
// conv_weight, conv_out,
...
...
paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc
浏览文件 @
d231e550
...
...
@@ -38,7 +38,7 @@ std::unique_ptr<ir::Graph> ConvReLUFusePass::ApplyImpl(
int
found_conv_relu_count
=
0
;
auto
handler
=
[
&
](
const
GraphPatternDetector
::
subgraph_t
&
subgraph
,
Graph
*
g
)
{
VLOG
(
4
)
<<
"handle ConvReLU fuse"
;
VLOG
(
4
0
)
<<
"handle ConvReLU fuse"
;
GET_IR_NODE_FROM_SUBGRAPH
(
conv_weight
,
conv_weight
,
conv_relu_pattern
);
// Filter
GET_IR_NODE_FROM_SUBGRAPH
(
conv_out
,
conv_out
,
conv_relu_pattern
);
// tmp
...
...
@@ -48,7 +48,7 @@ std::unique_ptr<ir::Graph> ConvReLUFusePass::ApplyImpl(
FuseOptions
fuse_option
=
FindFuseOption
(
*
conv
,
*
relu
);
if
(
fuse_option
==
DO_NOT_FUSE
)
{
VLOG
(
3
)
<<
"do not perform conv+relu fuse"
;
VLOG
(
3
0
)
<<
"do not perform conv+relu fuse"
;
return
;
}
...
...
paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.cc
浏览文件 @
d231e550
...
...
@@ -39,7 +39,7 @@ std::unique_ptr<ir::Graph> DepthwiseConvMKLDNNPass::ApplyImpl(
int
found_depthwise_conv_mkldnn_count
=
0
;
auto
handler
=
[
&
](
const
GraphPatternDetector
::
subgraph_t
&
subgraph
,
Graph
*
g
)
{
VLOG
(
3
)
<<
"handle DepthwiseConvMKLDNN fuse"
;
VLOG
(
3
0
)
<<
"handle DepthwiseConvMKLDNN fuse"
;
GET_NODE
(
depthwise_conv
,
(
*
pattern
));
depthwise_conv
->
Op
()
->
SetType
(
"conv2d"
);
found_depthwise_conv_mkldnn_count
++
;
...
...
paddle/fluid/framework/ir/fc_fuse_pass.cc
浏览文件 @
d231e550
...
...
@@ -39,7 +39,7 @@ std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl(
int
found_fc_count
=
0
;
auto
handler
=
[
&
](
const
GraphPatternDetector
::
subgraph_t
&
subgraph
,
Graph
*
g
)
{
VLOG
(
4
)
<<
"handle FC fuse"
;
VLOG
(
4
0
)
<<
"handle FC fuse"
;
GET_IR_NODE_FROM_SUBGRAPH
(
w
,
w
,
fc_pattern
);
GET_IR_NODE_FROM_SUBGRAPH
(
fc_bias
,
bias
,
fc_pattern
);
GET_IR_NODE_FROM_SUBGRAPH
(
fc_out
,
Out
,
fc_pattern
);
...
...
paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc
浏览文件 @
d231e550
...
...
@@ -61,7 +61,7 @@ std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseElewiseAddAct(
auto
handler
=
[
&
](
const
GraphPatternDetector
::
subgraph_t
&
subgraph
,
Graph
*
g
)
{
VLOG
(
4
)
<<
"handle FuseElewiseAddAct fuse"
;
VLOG
(
4
0
)
<<
"handle FuseElewiseAddAct fuse"
;
GET_IR_NODE_FROM_SUBGRAPH
(
ele_y
,
ele_y
,
elewise_add_act_pattern
);
GET_IR_NODE_FROM_SUBGRAPH
(
ele_out
,
elewise_add_out
,
elewise_add_act_pattern
);
...
...
@@ -77,10 +77,10 @@ std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseElewiseAddAct(
Node
*
elewise_add_act_node
=
CreateFuseElewiseAddActNode
(
g
,
act
,
ele_add
,
ele_x_n
,
ele_y_n
,
ele_out_n
,
act_out_n
);
VLOG
(
4
)
<<
"
\n\t
"
<<
ele_x_n
<<
" and "
<<
ele_y_n
<<
" -> "
<<
ele_add
->
Name
()
<<
" -> "
<<
ele_out_n
<<
"
\n
"
<<
"
\t
"
<<
ele_out_n
<<
" -> "
<<
act
->
Name
()
<<
" -> "
<<
act_out_n
;
VLOG
(
4
0
)
<<
"
\n\t
"
<<
ele_x_n
<<
" and "
<<
ele_y_n
<<
" -> "
<<
ele_add
->
Name
()
<<
" -> "
<<
ele_out_n
<<
"
\n
"
<<
"
\t
"
<<
ele_out_n
<<
" -> "
<<
act
->
Name
()
<<
" -> "
<<
act_out_n
;
ReLinkNodes
(
g
,
ele_out
,
ele_add
,
act
,
elewise_add_act_node
);
found_elewise_add_act_count
++
;
...
...
@@ -113,7 +113,7 @@ std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseActElewiseAdd(
auto
handler
=
[
&
](
const
GraphPatternDetector
::
subgraph_t
&
subgraph
,
Graph
*
g
)
{
VLOG
(
4
)
<<
"handle FuseElewiseAddAct fuse"
;
VLOG
(
4
0
)
<<
"handle FuseElewiseAddAct fuse"
;
GET_IR_NODE_FROM_SUBGRAPH
(
act_out
,
act_out
,
act_elewise_add_pattern
);
GET_IR_NODE_FROM_SUBGRAPH
(
ele_x
,
ele_x
,
act_elewise_add_pattern
);
GET_IR_NODE_FROM_SUBGRAPH
(
ele_out
,
elewise_add_out
,
...
...
@@ -129,9 +129,9 @@ std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseActElewiseAdd(
Node
*
elewise_add_act_node
=
CreateFuseElewiseAddActNode
(
g
,
ele_add
,
act
,
elewise_add_x_n
,
act_i_n
,
act_o_n
,
elewise_add_out_n
);
VLOG
(
4
)
<<
"
\n\t
"
<<
act_i_n
<<
" -> "
<<
act
->
Name
()
<<
" -> "
<<
act_o_n
<<
"
\n\t
"
<<
act_o_n
<<
" and "
<<
elewise_add_x_n
<<
" -> "
<<
ele_add
->
Name
()
<<
" -> "
<<
elewise_add_out_n
;
VLOG
(
4
0
)
<<
"
\n\t
"
<<
act_i_n
<<
" -> "
<<
act
->
Name
()
<<
" -> "
<<
act_o_n
<<
"
\n\t
"
<<
act_o_n
<<
" and "
<<
elewise_add_x_n
<<
" -> "
<<
ele_add
->
Name
()
<<
" -> "
<<
elewise_add_out_n
;
ReLinkNodes
(
g
,
act_out
,
act
,
ele_add
,
elewise_add_act_node
);
found_elewise_add_act_count
++
;
...
...
@@ -165,7 +165,7 @@ std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseElewiseAddActInplaceGrad(
auto
handler
=
[
&
](
const
GraphPatternDetector
::
subgraph_t
&
subgraph
,
Graph
*
g
)
{
VLOG
(
4
)
<<
"handle FuseElewiseAddActGrad1 fuse"
;
VLOG
(
4
0
)
<<
"handle FuseElewiseAddActGrad1 fuse"
;
GET_IR_NODE_FROM_SUBGRAPH
(
act_out
,
act_out
,
elewise_add_act_grad_pattern
);
GET_IR_NODE_FROM_SUBGRAPH
(
act_grad
,
act_grad
,
elewise_add_act_grad_pattern
);
GET_IR_NODE_FROM_SUBGRAPH
(
d_itermediate_out
,
d_itermediate_out
,
...
...
@@ -208,10 +208,10 @@ std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseElewiseAddActInplaceGrad(
auto
fused_node
=
g
->
CreateOpNode
(
&
desc
);
VLOG
(
4
)
<<
"
\n\t
"
<<
d_act_out_n
<<
" and "
<<
act_out_n
<<
" -> "
<<
act_grad
->
Name
()
<<
" -> "
<<
d_itermediate_out_n
<<
"
\n\t
"
<<
d_itermediate_out_n
<<
" and "
<<
act_out_n
<<
" -> "
<<
ele_add_grad
->
Name
()
<<
" -> "
<<
d_itermediate_out_n
;
VLOG
(
4
0
)
<<
"
\n\t
"
<<
d_act_out_n
<<
" and "
<<
act_out_n
<<
" -> "
<<
act_grad
->
Name
()
<<
" -> "
<<
d_itermediate_out_n
<<
"
\n\t
"
<<
d_itermediate_out_n
<<
" and "
<<
act_out_n
<<
" -> "
<<
ele_add_grad
->
Name
()
<<
" -> "
<<
d_itermediate_out_n
;
ReLinkNodes
(
g
,
d_itermediate_out
,
act_grad
,
ele_add_grad
,
fused_node
);
found_elewise_add_act_count
++
;
...
...
paddle/fluid/framework/ir/graph.cc
浏览文件 @
d231e550
...
...
@@ -92,7 +92,7 @@ Graph::Graph(const ProgramDesc &program) : program_(program) {
std
::
map
<
std
::
string
,
std
::
vector
<
ir
::
Node
*>>
Graph
::
InitFromProgram
(
const
ProgramDesc
&
program
)
{
VLOG
(
3
)
<<
"block in program:"
<<
program_
.
Size
();
VLOG
(
3
0
)
<<
"block in program:"
<<
program_
.
Size
();
std
::
unordered_map
<
std
::
string
,
VarDesc
*>
all_vars
;
// var nodes for each var name, will have multiple versions in SSA
std
::
map
<
std
::
string
,
std
::
vector
<
ir
::
Node
*>>
var_nodes
;
...
...
@@ -160,7 +160,7 @@ void Graph::ResolveHazard(
auto
it_old
=
versions
.
rbegin
();
++
it_old
;
for
(;
it_old
!=
versions
.
rend
();
it_new
=
it_old
,
++
it_old
)
{
VLOG
(
3
)
<<
"deal with var: "
<<
(
*
it_new
)
->
Name
();
VLOG
(
3
0
)
<<
"deal with var: "
<<
(
*
it_new
)
->
Name
();
ir
::
Node
*
write_op
=
(
*
it_new
)
->
inputs
.
empty
()
?
nullptr
:
(
*
it_new
)
->
inputs
[
0
];
const
auto
&
read_ops
=
(
*
it_old
)
->
outputs
;
...
...
paddle/fluid/framework/ir/graph.h
浏览文件 @
d231e550
...
...
@@ -89,7 +89,7 @@ class Graph {
attr_name
);
attrs_
[
attr_name
]
=
attr
;
attr_dels_
[
attr_name
]
=
[
attr
,
attr_name
]()
{
VLOG
(
3
)
<<
"deleting "
<<
attr_name
;
VLOG
(
3
0
)
<<
"deleting "
<<
attr_name
;
delete
attr
;
};
}
...
...
paddle/fluid/framework/ir/graph_helper.cc
浏览文件 @
d231e550
...
...
@@ -33,8 +33,9 @@ void SortHelper(
}
}
VLOG
(
3
)
<<
"topology sort insert: "
<<
node
->
Name
()
<<
reinterpret_cast
<
void
*>
(
node
)
<<
" input "
<<
node
->
inputs
.
size
();
VLOG
(
30
)
<<
"topology sort insert: "
<<
node
->
Name
()
<<
reinterpret_cast
<
void
*>
(
node
)
<<
" input "
<<
node
->
inputs
.
size
();
ret
->
push_back
(
node
);
}
...
...
@@ -103,9 +104,9 @@ std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
for
(
auto
&
var
:
n
->
inputs
)
{
for
(
auto
&
adj_n
:
var
->
inputs
)
{
PADDLE_ENFORCE
(
adj_n
->
NodeType
()
==
ir
::
Node
::
Type
::
kOperation
);
VLOG
(
4
)
<<
"adj "
<<
adj_n
->
Name
()
<<
reinterpret_cast
<
void
*>
(
adj_n
)
<<
" -> "
<<
n
->
Name
()
<<
reinterpret_cast
<
void
*>
(
n
)
<<
" via "
<<
var
->
Name
()
<<
reinterpret_cast
<
void
*>
(
var
);
VLOG
(
4
0
)
<<
"adj "
<<
adj_n
->
Name
()
<<
reinterpret_cast
<
void
*>
(
adj_n
)
<<
" -> "
<<
n
->
Name
()
<<
reinterpret_cast
<
void
*>
(
n
)
<<
" via "
<<
var
->
Name
()
<<
reinterpret_cast
<
void
*>
(
var
);
adj_list
[
n
].
insert
(
adj_n
);
}
}
...
...
@@ -163,10 +164,10 @@ size_t GraphNum(const Graph &graph) {
graph_nodes
.
emplace_back
(
g_nodes
);
}
if
(
VLOG_IS_ON
(
10
))
{
VLOG
(
10
)
<<
"graph_num: "
<<
graph_nodes
.
size
();
if
(
VLOG_IS_ON
(
10
0
))
{
VLOG
(
10
0
)
<<
"graph_num: "
<<
graph_nodes
.
size
();
for
(
auto
&
g_n
:
graph_nodes
)
{
VLOG
(
10
)
<<
"graph_nodes: "
<<
g_n
.
size
();
VLOG
(
10
0
)
<<
"graph_nodes: "
<<
g_n
.
size
();
if
(
g_n
.
size
()
<
10
)
{
std
::
stringstream
out
;
for
(
auto
&
node
:
g_n
)
{
...
...
@@ -180,7 +181,7 @@ size_t GraphNum(const Graph &graph) {
}
out
<<
"]"
;
}
VLOG
(
10
)
<<
out
.
str
();
VLOG
(
10
0
)
<<
out
.
str
();
}
}
}
...
...
paddle/fluid/framework/ir/graph_pattern_detector.cc
浏览文件 @
d231e550
...
...
@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <array>
#include <string>
#include <vector>
...
...
@@ -91,19 +92,19 @@ void GraphPatternDetector::operator()(Graph *graph,
PrettyLogEndl
(
Style
::
detail
(),
"--- detect %d subgraphs"
,
subgraphs
.
size
());
int
id
=
0
;
for
(
auto
&
g
:
subgraphs
)
{
VLOG
(
3
)
<<
"optimizing #"
<<
id
++
<<
" subgraph"
;
VLOG
(
3
0
)
<<
"optimizing #"
<<
id
++
<<
" subgraph"
;
handler
(
g
,
graph
);
}
}
bool
GraphPatternDetector
::
MarkPDNodesInGraph
(
const
ir
::
Graph
&
graph
)
{
VLOG
(
3
)
<<
"mark pdnodes in graph"
;
VLOG
(
3
0
)
<<
"mark pdnodes in graph"
;
if
(
graph
.
Nodes
().
empty
())
return
false
;
for
(
auto
&
node
:
GraphTraits
::
DFS
(
graph
))
{
for
(
const
auto
&
pdnode
:
pattern_
.
nodes
())
{
if
(
pdnode
->
Tell
(
&
node
))
{
VLOG
(
4
)
<<
"pdnode "
<<
pdnode
->
name
()
<<
" marked"
;
VLOG
(
4
0
)
<<
"pdnode "
<<
pdnode
->
name
()
<<
" marked"
;
pdnodes2nodes_
[
pdnode
.
get
()].
insert
(
&
node
);
}
}
...
...
@@ -111,7 +112,7 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph &graph) {
// Check to early stop if some PDNode can't find matched Node.
for
(
auto
&
pdnode
:
pattern_
.
nodes
())
{
if
(
!
pdnodes2nodes_
.
count
(
pdnode
.
get
()))
{
VLOG
(
4
)
<<
pdnode
->
name
()
<<
" can't find matched Node, early stop"
;
VLOG
(
4
0
)
<<
pdnode
->
name
()
<<
" can't find matched Node, early stop"
;
// return false;
}
}
...
...
@@ -120,7 +121,7 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph &graph) {
GetMarkedNodes
(
const_cast
<
Graph
*>
(
&
graph
)).
insert
(
n
);
}
}
VLOG
(
3
)
<<
pdnodes2nodes_
.
size
()
<<
" nodes marked"
;
VLOG
(
3
0
)
<<
pdnodes2nodes_
.
size
()
<<
" nodes marked"
;
return
!
pdnodes2nodes_
.
empty
();
}
...
...
@@ -213,7 +214,7 @@ GraphPatternDetector::DetectPatterns() {
// Extend a PDNode to subgraphs by deducing the connection relations defined
// in edges of PDNodes.
for
(
const
auto
&
edge
:
pattern_
.
edges
())
{
VLOG
(
4
)
<<
"check "
<<
edge
.
first
->
name
()
<<
" -> "
<<
edge
.
second
->
name
();
VLOG
(
4
0
)
<<
"check "
<<
edge
.
first
->
name
()
<<
" -> "
<<
edge
.
second
->
name
();
// TODO(Superjomn) Fix bug here, the groups might be duplicate here.
// Each role has two PDNodes, which indicates two roles.
// Detect two Nodes that can match these two roles and they are connected.
...
...
@@ -224,7 +225,7 @@ GraphPatternDetector::DetectPatterns() {
// source -> target
for
(
Node
*
source
:
pdnodes2nodes_
[
edge
.
first
])
{
for
(
Node
*
target
:
pdnodes2nodes_
[
edge
.
second
])
{
VLOG
(
8
)
<<
"check "
<<
source
->
id
()
<<
" -- "
<<
target
->
id
();
VLOG
(
8
0
)
<<
"check "
<<
source
->
id
()
<<
" -- "
<<
target
->
id
();
// TODO(Superjomn) add some prune strategies.
for
(
const
auto
&
group
:
pre_groups
)
{
HitGroup
new_group
=
group
;
...
...
@@ -240,12 +241,13 @@ GraphPatternDetector::DetectPatterns() {
}
}
}
VLOG
(
3
)
<<
"step "
<<
step
<<
" get records: "
<<
cur_groups
.
size
();
VLOG
(
3
0
)
<<
"step "
<<
step
<<
" get records: "
<<
cur_groups
.
size
();
for
(
auto
&
group
:
cur_groups
)
{
for
(
auto
&
item
:
group
.
roles
)
{
VLOG
(
4
)
<<
"node "
<<
item
.
second
->
id
()
<<
" as "
<<
item
.
first
->
name
();
VLOG
(
40
)
<<
"node "
<<
item
.
second
->
id
()
<<
" as "
<<
item
.
first
->
name
();
}
VLOG
(
4
)
<<
"========================================================="
;
VLOG
(
4
0
)
<<
"========================================================="
;
}
}
...
...
paddle/fluid/framework/ir/graph_viz_pass.cc
浏览文件 @
d231e550
...
...
@@ -41,7 +41,7 @@ std::string FormatName(const Node* node) {
std
::
unique_ptr
<
ir
::
Graph
>
GraphVizPass
::
ApplyImpl
(
std
::
unique_ptr
<
ir
::
Graph
>
graph
)
const
{
const
std
::
string
graph_viz_path
=
Get
<
std
::
string
>
(
kGraphVizPath
);
VLOG
(
3
)
<<
"draw IR graph viz to "
<<
graph_viz_path
;
VLOG
(
3
0
)
<<
"draw IR graph viz to "
<<
graph_viz_path
;
std
::
unique_ptr
<
std
::
ostream
>
fout
(
new
std
::
ofstream
(
graph_viz_path
));
PADDLE_ENFORCE
(
fout
->
good
());
std
::
ostream
&
sout
=
*
fout
;
...
...
paddle/fluid/framework/ir/mkldnn_placement_pass.cc
浏览文件 @
d231e550
...
...
@@ -20,7 +20,7 @@ namespace ir {
std
::
unique_ptr
<
ir
::
Graph
>
MKLDNNPlacementPass
::
ApplyImpl
(
std
::
unique_ptr
<
ir
::
Graph
>
graph
)
const
{
VLOG
(
3
)
<<
"Aplies MKL-DNN placement strategy."
;
VLOG
(
3
0
)
<<
"Aplies MKL-DNN placement strategy."
;
for
(
const
Node
*
n
:
graph
->
Nodes
())
{
if
(
n
->
IsOp
()
&&
n
->
Op
()
->
HasAttr
(
"use_mkldnn"
))
{
n
->
Op
()
->
SetAttr
(
"use_mkldnn"
,
true
);
...
...
paddle/fluid/framework/ir/multi_batch_merge_pass.cc
浏览文件 @
d231e550
...
...
@@ -62,7 +62,7 @@ VarDesc UpdateGradVarDesc(
string
::
Sprintf
(
"%s.repeat.%d"
,
var_desc
->
Name
(),
repeat
);
VarDesc
repeated_var
=
CopyVarDesc
(
var_desc
);
repeated_var
.
SetName
(
new_gname
);
VLOG
(
3
)
<<
"update "
<<
var_desc
->
Name
()
<<
" to repeat "
<<
repeat
;
VLOG
(
3
0
)
<<
"update "
<<
var_desc
->
Name
()
<<
" to repeat "
<<
repeat
;
return
repeated_var
;
}
return
*
var_desc
;
...
...
@@ -78,7 +78,7 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
std
::
vector
<
ir
::
Node
*>
nodes
=
TopologySortOperations
(
*
graph
);
auto
origin_nodes
=
graph
->
ReleaseNodes
();
VLOG
(
3
)
<<
"origin nodes count: "
<<
origin_nodes
.
size
();
VLOG
(
3
0
)
<<
"origin nodes count: "
<<
origin_nodes
.
size
();
ir
::
Graph
&
result
=
*
graph
;
// 1. record op nodes of different roles
...
...
@@ -137,8 +137,8 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
"%s.repeat.%d"
,
repeated_op
.
Input
(
"Variance"
)[
0
],
i
);
bn_vars_need_rename
.
insert
(
repeated_op
.
Input
(
"Mean"
)[
0
]);
bn_vars_need_rename
.
insert
(
repeated_op
.
Input
(
"Variance"
)[
0
]);
VLOG
(
3
)
<<
"renaming "
<<
repeated_op
.
Input
(
"Mean"
)[
0
]
<<
" to "
<<
new_mean_name
;
VLOG
(
3
0
)
<<
"renaming "
<<
repeated_op
.
Input
(
"Mean"
)[
0
]
<<
" to "
<<
new_mean_name
;
repeated_op
.
RenameInput
(
repeated_op
.
Input
(
"Mean"
)[
0
],
new_mean_name
);
repeated_op
.
RenameInput
(
repeated_op
.
Input
(
"Variance"
)[
0
],
new_var_name
);
repeated_op
.
RenameOutput
(
repeated_op
.
Output
(
"MeanOut"
)[
0
],
...
...
paddle/fluid/framework/ir/pass.h
浏览文件 @
d231e550
...
...
@@ -76,7 +76,7 @@ class Pass {
attr_name
);
attrs_
[
attr_name
]
=
attr
;
attr_dels_
[
attr_name
]
=
[
attr
,
attr_name
]()
{
VLOG
(
3
)
<<
"deleting "
<<
attr_name
;
VLOG
(
3
0
)
<<
"deleting "
<<
attr_name
;
delete
attr
;
};
}
...
...
paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc
浏览文件 @
d231e550
...
...
@@ -12,10 +12,13 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h"
#include <set>
#include <string>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include "paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h"
#include "paddle/fluid/framework/lod_tensor.h"
namespace
paddle
{
...
...
@@ -159,10 +162,7 @@ PDNode* BuildFCPattern(PDPattern* pattern, PDNode* fc_x) {
std
::
set
<
std
::
string
>
acts
({
"sigmoid"
,
"tanh"
,
"relu"
,
"identity"
});
PDNode
*
act
=
pattern
->
NewNode
(
[
=
](
Node
*
x
)
{
return
x
&&
x
->
IsOp
()
&&
acts
.
count
(
x
->
Op
()
->
Type
());
},
[
=
](
Node
*
x
)
{
return
x
&&
x
->
IsOp
()
&&
acts
.
count
(
x
->
Op
()
->
Type
());
},
"act"
);
PDNode
*
fc_out
=
pattern
->
NewNode
(
...
...
@@ -196,7 +196,7 @@ std::unique_ptr<ir::Graph> SeqConcatFcFusePass::ApplyImpl(
detector
(
graph
.
get
(),
[
&
](
const
GraphPatternDetector
::
subgraph_t
&
subgraph
,
Graph
*
graph
)
{
VLOG
(
4
)
<<
"get one concat pattern"
;
VLOG
(
4
0
)
<<
"get one concat pattern"
;
// fc
GET_NODE
(
fc_w
,
detector
.
pattern
());
GET_NODE
(
fc_bias
,
detector
.
pattern
());
...
...
paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc
浏览文件 @
d231e550
...
...
@@ -60,7 +60,7 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope) {
auto
handler
=
[
&
](
const
GraphPatternDetector
::
subgraph_t
&
subgraph
,
Graph
*
g
)
{
VLOG
(
4
)
<<
"handle SeqConv EltAdd Relu fuse"
;
VLOG
(
4
0
)
<<
"handle SeqConv EltAdd Relu fuse"
;
GET_IR_NODE_FROM_SUBGRAPH
(
seqconv
,
seqconv
,
fuse_pattern
);
GET_IR_NODE_FROM_SUBGRAPH
(
seqconv_weight
,
seqconv_weight
,
fuse_pattern
);
GET_IR_NODE_FROM_SUBGRAPH
(
seqconv_out
,
seqconv_out
,
fuse_pattern
);
...
...
paddle/fluid/framework/lod_rank_table.cc
浏览文件 @
d231e550
...
...
@@ -31,7 +31,7 @@ void LoDRankTable::Reset(const LoD& lod, size_t level) {
TableItem
item
;
item
.
index
=
i
;
item
.
length
=
vec
[
i
+
1
]
-
vec
[
i
];
VLOG
(
10
)
<<
"Add item to rank table "
<<
item
.
index
<<
" "
<<
item
.
length
;
VLOG
(
10
0
)
<<
"Add item to rank table "
<<
item
.
index
<<
" "
<<
item
.
length
;
items_
.
emplace_back
(
item
);
}
// NOTE(yuyang18):
...
...
paddle/fluid/framework/mixed_vector_test.cc
浏览文件 @
d231e550
...
...
@@ -51,7 +51,7 @@ TEST(mixed_vector, InitWithCount) {
TEST
(
mixed_vector
,
ForEach
)
{
vec
<
int
>
tmp
;
for
(
auto
&
v
:
tmp
)
{
VLOG
(
3
)
<<
v
;
VLOG
(
3
0
)
<<
v
;
}
}
...
...
paddle/fluid/framework/naive_executor.cc
浏览文件 @
d231e550
...
...
@@ -71,7 +71,7 @@ void NaiveExecutor::Prepare(Scope *parent_scope,
void
NaiveExecutor
::
Run
()
{
for
(
auto
&
op
:
ops_
)
{
VLOG
(
4
)
<<
"run "
<<
op
->
Type
();
VLOG
(
4
0
)
<<
"run "
<<
op
->
Type
();
op
->
Run
(
*
scope_
,
place_
);
}
}
...
...
@@ -95,21 +95,21 @@ void NaiveExecutor::CreateVariables(const ProgramDesc &desc, Scope *scope,
if
(
var
->
Persistable
())
{
auto
*
ptr
=
const_cast
<
Scope
*>
(
ancestor_scope
)
->
Var
(
var
->
Name
());
InitializeVariable
(
ptr
,
var
->
GetType
());
VLOG
(
3
)
<<
"Create Variable "
<<
var
->
Name
()
<<
" global, which pointer is "
<<
ptr
;
VLOG
(
3
0
)
<<
"Create Variable "
<<
var
->
Name
()
<<
" global, which pointer is "
<<
ptr
;
}
else
{
// Create temporary variables in local scope.
auto
*
ptr
=
scope
->
Var
(
var
->
Name
());
InitializeVariable
(
ptr
,
var
->
GetType
());
VLOG
(
3
)
<<
"Create Variable "
<<
var
->
Name
()
<<
" locally, which pointer is "
<<
ptr
;
VLOG
(
3
0
)
<<
"Create Variable "
<<
var
->
Name
()
<<
" locally, which pointer is "
<<
ptr
;
}
}
}
else
{
for
(
auto
&
var
:
global_block
.
AllVars
())
{
auto
*
ptr
=
scope
->
Var
(
var
->
Name
());
InitializeVariable
(
ptr
,
var
->
GetType
());
VLOG
(
3
)
<<
"Create variable "
<<
var
->
Name
()
<<
", which pointer is "
<<
ptr
;
VLOG
(
3
0
)
<<
"Create variable "
<<
var
->
Name
()
<<
", which pointer is "
<<
ptr
;
}
}
}
...
...
paddle/fluid/framework/op_desc.cc
浏览文件 @
d231e550
...
...
@@ -82,7 +82,7 @@ class CompileTimeInferShapeContext : public InferShapeContext {
auto
*
in_var
=
block_
.
FindVarRecursive
(
Inputs
(
in
)[
i
]);
auto
*
out_var
=
block_
.
FindVarRecursive
(
Outputs
(
out
)[
j
]);
if
(
in_var
->
GetType
()
!=
proto
::
VarType
::
LOD_TENSOR
)
{
VLOG
(
3
)
<<
"input "
<<
in
<<
" is not LodTensor"
;
VLOG
(
3
0
)
<<
"input "
<<
in
<<
" is not LodTensor"
;
return
;
}
out_var
->
SetLoDLevel
(
in_var
->
GetLoDLevel
());
...
...
@@ -241,32 +241,32 @@ void OpDesc::SetAttr(const std::string &name, const Attribute &v) {
const
proto
::
OpProto
::
Attr
&
attr
=
GetProtoAttr
(
name
);
switch
(
attr
.
type
())
{
case
proto
::
AttrType
::
BOOLEANS
:
{
VLOG
(
11
)
<<
"SetAttr: "
<<
Type
()
<<
", "
<<
name
<<
" from INTS to BOOLEANS"
;
VLOG
(
11
0
)
<<
"SetAttr: "
<<
Type
()
<<
", "
<<
name
<<
" from INTS to BOOLEANS"
;
this
->
attrs_
[
name
]
=
std
::
vector
<
bool
>
();
break
;
}
case
proto
::
AttrType
::
INTS
:
{
VLOG
(
11
)
<<
"SetAttr: "
<<
Type
()
<<
", "
<<
name
<<
" from INTS to INTS"
;
VLOG
(
11
0
)
<<
"SetAttr: "
<<
Type
()
<<
", "
<<
name
<<
" from INTS to INTS"
;
this
->
attrs_
[
name
]
=
std
::
vector
<
int
>
();
break
;
}
case
proto
::
AttrType
::
FLOATS
:
{
VLOG
(
11
)
<<
"SetAttr: "
<<
Type
()
<<
", "
<<
name
<<
" from INTS to FLOATS"
;
VLOG
(
11
0
)
<<
"SetAttr: "
<<
Type
()
<<
", "
<<
name
<<
" from INTS to FLOATS"
;
this
->
attrs_
[
name
]
=
std
::
vector
<
float
>
();
break
;
}
case
proto
::
AttrType
::
STRINGS
:
{
VLOG
(
11
)
<<
"SetAttr: "
<<
Type
()
<<
", "
<<
name
<<
" from INTS to STRINGS"
;
VLOG
(
11
0
)
<<
"SetAttr: "
<<
Type
()
<<
", "
<<
name
<<
" from INTS to STRINGS"
;
this
->
attrs_
[
name
]
=
std
::
vector
<
std
::
string
>
();
break
;
}
case
proto
::
AttrType
::
BLOCKS
:
{
VLOG
(
11
)
<<
"SetAttr: "
<<
Type
()
<<
", "
<<
name
<<
" from INTS to BLOCKS"
;
VLOG
(
11
0
)
<<
"SetAttr: "
<<
Type
()
<<
", "
<<
name
<<
" from INTS to BLOCKS"
;
this
->
SetBlocksAttr
(
name
,
std
::
vector
<
BlockDesc
*>
());
return
;
}
...
...
@@ -499,13 +499,13 @@ void OpDesc::CheckAttrs() {
}
void
OpDesc
::
InferShape
(
const
BlockDesc
&
block
)
const
{
VLOG
(
3
)
<<
"CompileTime infer shape on "
<<
Type
();
VLOG
(
3
0
)
<<
"CompileTime infer shape on "
<<
Type
();
InitInferShapeFuncs
();
auto
&
infer_shape
=
OpInfoMap
::
Instance
().
Get
(
this
->
Type
()).
infer_shape_
;
PADDLE_ENFORCE
(
static_cast
<
bool
>
(
infer_shape
),
"%s's infer_shape has not been registered"
,
this
->
Type
());
CompileTimeInferShapeContext
ctx
(
*
this
,
block
);
if
(
VLOG_IS_ON
(
10
))
{
if
(
VLOG_IS_ON
(
10
0
))
{
std
::
ostringstream
sout
;
auto
inames
=
this
->
InputArgumentNames
();
sout
<<
" From ["
;
...
...
@@ -516,7 +516,7 @@ void OpDesc::InferShape(const BlockDesc &block) const {
std
::
copy
(
onames
.
begin
(),
onames
.
end
(),
std
::
ostream_iterator
<
std
::
string
>
(
sout
,
", "
));
sout
<<
"]"
;
VLOG
(
10
)
<<
sout
.
str
();
VLOG
(
10
0
)
<<
sout
.
str
();
}
infer_shape
(
&
ctx
);
}
...
...
@@ -607,7 +607,7 @@ DDim CompileTimeInferShapeContext::GetDim(const std::string &name) const {
auto
shape
=
var
->
GetShape
();
res
=
shape
.
empty
()
?
make_ddim
({
0UL
})
:
make_ddim
(
shape
);
}
catch
(...)
{
VLOG
(
5
)
<<
"GetDim of variable "
<<
name
<<
" error"
;
VLOG
(
5
0
)
<<
"GetDim of variable "
<<
name
<<
" error"
;
std
::
rethrow_exception
(
std
::
current_exception
());
}
return
res
;
...
...
@@ -624,7 +624,7 @@ std::vector<DDim> CompileTimeInferShapeContext::GetRepeatedDims(
res
.
push_back
(
s
.
empty
()
?
make_ddim
({
0UL
})
:
make_ddim
(
s
));
}
}
catch
(...)
{
VLOG
(
5
)
<<
"GetRepeatedDim of variable "
<<
name
<<
" error."
;
VLOG
(
5
0
)
<<
"GetRepeatedDim of variable "
<<
name
<<
" error."
;
std
::
rethrow_exception
(
std
::
current_exception
());
}
return
res
;
...
...
paddle/fluid/framework/op_registry.cc
浏览文件 @
d231e550
...
...
@@ -46,9 +46,9 @@ static VariableNameMap ConvertOpDescVarsToVarNameMap(
std
::
unique_ptr
<
OperatorBase
>
OpRegistry
::
CreateOp
(
const
proto
::
OpDesc
&
op_desc
)
{
VLOG
(
1
)
<<
"CreateOp directly from OpDesc is deprecated. It should only be"
"used in unit tests. Use CreateOp(const OpDesc& op_desc) "
"instead."
;
VLOG
(
1
0
)
<<
"CreateOp directly from OpDesc is deprecated. It should only be"
"used in unit tests. Use CreateOp(const OpDesc& op_desc) "
"instead."
;
VariableNameMap
inputs
=
ConvertOpDescVarsToVarNameMap
(
op_desc
.
inputs
());
VariableNameMap
outputs
=
ConvertOpDescVarsToVarNameMap
(
op_desc
.
outputs
());
AttributeMap
attrs
;
...
...
paddle/fluid/framework/operator.cc
浏览文件 @
d231e550
...
...
@@ -140,7 +140,7 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
}
void
OperatorBase
::
Run
(
const
Scope
&
scope
,
const
platform
::
Place
&
place
)
{
VLOG
(
4
)
<<
place
<<
" "
<<
DebugStringEx
(
&
scope
);
VLOG
(
4
0
)
<<
place
<<
" "
<<
DebugStringEx
(
&
scope
);
if
(
platform
::
is_gpu_place
(
place
))
{
#ifndef PADDLE_WITH_CUDA
PADDLE_THROW
(
"Cannot run operator on place %s"
,
place
);
...
...
@@ -160,7 +160,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
}
else
{
RunImpl
(
scope
,
place
);
}
VLOG
(
3
)
<<
place
<<
" "
<<
DebugStringEx
(
&
scope
);
VLOG
(
3
0
)
<<
place
<<
" "
<<
DebugStringEx
(
&
scope
);
}
bool
OperatorBase
::
HasInputs
(
const
std
::
string
&
name
)
const
{
...
...
@@ -259,6 +259,8 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
if
(
row_size
>=
0
)
{
ss
<<
"[row_size="
<<
row_size
<<
"]"
;
}
std
::
string
dtype
=
GetDtype
(
*
scope
,
output
.
second
[
i
]);
ss
<<
":"
<<
dtype
;
ss
<<
"["
<<
GetDims
(
*
scope
,
var_name
,
true
)
<<
"]"
;
ss
<<
"("
<<
GetLoD
(
*
scope
,
var_name
)
<<
")"
;
}
...
...
@@ -715,14 +717,14 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
auto
expected_kernel_key
=
this
->
GetExpectedKernelType
(
ExecutionContext
(
*
this
,
scope
,
*
dev_ctx
));
VLOG
(
3
)
<<
"expected_kernel_key:"
<<
expected_kernel_key
;
VLOG
(
3
0
)
<<
"expected_kernel_key:"
<<
expected_kernel_key
;
auto
kernel_iter
=
kernels
.
find
(
expected_kernel_key
);
#ifdef PADDLE_WITH_MKLDNN
// workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set
if
(
kernel_iter
==
kernels
.
end
()
&&
expected_kernel_key
.
library_type_
==
LibraryType
::
kMKLDNN
)
{
VLOG
(
3
)
<<
"missing MKLDNN kernel: fallbacking to PLAIN one"
;
VLOG
(
3
0
)
<<
"missing MKLDNN kernel: fallbacking to PLAIN one"
;
expected_kernel_key
.
library_type_
=
LibraryType
::
kPlain
;
expected_kernel_key
.
data_layout_
=
DataLayout
::
kAnyLayout
;
kernel_iter
=
kernels
.
find
(
expected_kernel_key
);
...
...
@@ -774,7 +776,8 @@ void OperatorWithKernel::TransferInplaceVarsBack(
const
Scope
&
scope
,
const
std
::
vector
<
std
::
string
>&
inplace_vars
,
const
Scope
&
transfer_scope
)
const
{
for
(
auto
&
var_name
:
inplace_vars
)
{
VLOG
(
3
)
<<
"share inplace var "
+
var_name
+
" back to it's original scope"
;
VLOG
(
30
)
<<
"share inplace var "
+
var_name
+
" back to it's original scope"
;
auto
*
original_tensor
=
GetMutableLoDTensorOrSelectedRowsValueFromVar
(
scope
.
FindVar
(
var_name
));
auto
*
var
=
transfer_scope
.
FindVar
(
var_name
);
...
...
@@ -815,8 +818,8 @@ Scope* OperatorWithKernel::TryTransferData(
transfered_inplace_vars
->
emplace_back
(
var_name
);
}
VLOG
(
3
)
<<
"Transform Variable "
<<
var_name
<<
" from "
<<
kernel_type_for_var
<<
" to "
<<
expected_kernel_key
;
VLOG
(
3
0
)
<<
"Transform Variable "
<<
var_name
<<
" from "
<<
kernel_type_for_var
<<
" to "
<<
expected_kernel_key
;
if
(
new_scope
==
nullptr
)
{
new_scope
=
&
scope
.
NewScope
();
...
...
paddle/fluid/framework/parallel_executor.cc
浏览文件 @
d231e550
...
...
@@ -199,7 +199,7 @@ void ParallelExecutor::BCastParamsToDevices(
auto
&
main_tensor
=
main_var
->
Get
<
LoDTensor
>
();
if
(
!
main_tensor
.
IsInitialized
())
{
VLOG
(
3
)
<<
"one in var not inited, return!"
;
VLOG
(
3
0
)
<<
"one in var not inited, return!"
;
continue
;
}
auto
&
dims
=
main_tensor
.
dims
();
...
...
paddle/fluid/framework/scope.cc
浏览文件 @
d231e550
...
...
@@ -149,7 +149,7 @@ Variable* Scope::VarInternal(const std::string& name) {
v
=
new
Variable
();
vars_
[
name
].
reset
(
v
);
VLOG
(
3
)
<<
"Create variable "
<<
name
;
VLOG
(
3
0
)
<<
"Create variable "
<<
name
;
v
->
name_
=
&
(
vars_
.
find
(
name
)
->
first
);
return
v
;
}
...
...
paddle/fluid/framework/selected_rows.cc
浏览文件 @
d231e550
...
...
@@ -176,7 +176,7 @@ void SelectedRows::Get(const framework::Tensor& ids, framework::Tensor* value,
PADDLE_ENFORCE
(
value
->
IsInitialized
(),
"The value tensor should be initialized."
);
if
(
ids
.
numel
()
==
0
)
{
VLOG
(
3
)
<<
"keys is empty, please check data!"
;
VLOG
(
3
0
)
<<
"keys is empty, please check data!"
;
}
else
{
int64_t
value_width
=
value_
->
numel
()
/
value_
->
dims
()[
0
];
PADDLE_ENFORCE_EQ
(
value_width
,
value
->
numel
()
/
value
->
dims
()[
0
],
...
...
paddle/fluid/framework/tensor_util.cc
浏览文件 @
d231e550
...
...
@@ -23,8 +23,8 @@ namespace framework {
void
TensorCopy
(
const
Tensor
&
src
,
const
platform
::
Place
&
dst_place
,
const
platform
::
DeviceContext
&
ctx
,
Tensor
*
dst
)
{
VLOG
(
3
)
<<
"TensorCopy "
<<
src
.
dims
()
<<
" from "
<<
src
.
place
()
<<
" to "
<<
dst_place
;
VLOG
(
3
0
)
<<
"TensorCopy "
<<
src
.
dims
()
<<
" from "
<<
src
.
place
()
<<
" to "
<<
dst_place
;
src
.
check_memory_size
();
dst
->
Resize
(
src
.
dims
());
...
...
@@ -38,8 +38,8 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
if
(
platform
::
is_cpu_place
(
src_place
)
&&
platform
::
is_cpu_place
(
dst_place
))
{
if
(
src_ptr
==
dst_ptr
)
{
VLOG
(
3
)
<<
"Skip copy the same data async from "
<<
src_place
<<
" to "
<<
dst_place
;
VLOG
(
3
0
)
<<
"Skip copy the same data async from "
<<
src_place
<<
" to "
<<
dst_place
;
return
;
}
memory
::
Copy
(
boost
::
get
<
platform
::
CPUPlace
>
(
dst_place
),
dst_ptr
,
...
...
@@ -78,8 +78,8 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
reinterpret_cast
<
const
platform
::
CUDADeviceContext
&>
(
ctx
).
stream
();
if
(
platform
::
is_same_place
(
src_place
,
dst_place
))
{
if
(
src_ptr
==
dst_ptr
)
{
VLOG
(
3
)
<<
"Skip copy the same data async from "
<<
src_place
<<
" to "
<<
dst_place
;
VLOG
(
3
0
)
<<
"Skip copy the same data async from "
<<
src_place
<<
" to "
<<
dst_place
;
return
;
}
memory
::
Copy
(
dst_gpu_place
,
dst_ptr
,
src_gpu_place
,
src_ptr
,
size
,
...
...
@@ -115,8 +115,8 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
void
TensorCopySync
(
const
Tensor
&
src
,
const
platform
::
Place
&
dst_place
,
Tensor
*
dst
)
{
VLOG
(
3
)
<<
"TensorCopySync "
<<
src
.
dims
()
<<
" from "
<<
src
.
place
()
<<
" to "
<<
dst_place
;
VLOG
(
3
0
)
<<
"TensorCopySync "
<<
src
.
dims
()
<<
" from "
<<
src
.
place
()
<<
" to "
<<
dst_place
;
src
.
check_memory_size
();
dst
->
Resize
(
src
.
dims
());
dst
->
set_layout
(
src
.
layout
());
...
...
@@ -126,8 +126,8 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
auto
size
=
src
.
numel
()
*
SizeOfType
(
src
.
type
());
if
(
platform
::
is_cpu_place
(
src_place
)
&&
platform
::
is_cpu_place
(
dst_place
))
{
if
(
src_ptr
==
dst_ptr
)
{
VLOG
(
3
)
<<
"Skip copy the same data from "
<<
src_place
<<
" to "
<<
dst_place
;
VLOG
(
3
0
)
<<
"Skip copy the same data from "
<<
src_place
<<
" to "
<<
dst_place
;
return
;
}
memory
::
Copy
(
boost
::
get
<
platform
::
CPUPlace
>
(
dst_place
),
dst_ptr
,
...
...
@@ -147,8 +147,8 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
}
else
if
(
platform
::
is_gpu_place
(
src_place
)
&&
platform
::
is_gpu_place
(
dst_place
))
{
if
(
src_ptr
==
dst_ptr
&&
platform
::
is_same_place
(
src_place
,
dst_place
))
{
VLOG
(
3
)
<<
"Skip copy the same data from "
<<
src_place
<<
" to "
<<
dst_place
;
VLOG
(
3
0
)
<<
"Skip copy the same data from "
<<
src_place
<<
" to "
<<
dst_place
;
return
;
}
auto
src_gpu_place
=
boost
::
get
<
platform
::
CUDAPlace
>
(
src_place
);
...
...
paddle/fluid/framework/threadpool.cc
浏览文件 @
d231e550
...
...
@@ -39,7 +39,7 @@ void ThreadPool::Init() {
int
num_threads
=
std
::
thread
::
hardware_concurrency
();
if
(
FLAGS_dist_threadpool_size
>
0
)
{
num_threads
=
FLAGS_dist_threadpool_size
;
VLOG
(
1
)
<<
"set dist_threadpool_size to "
<<
num_threads
;
VLOG
(
1
0
)
<<
"set dist_threadpool_size to "
<<
num_threads
;
}
PADDLE_ENFORCE_GT
(
num_threads
,
0
);
threadpool_
.
reset
(
new
ThreadPool
(
num_threads
));
...
...
paddle/fluid/framework/var_desc.cc
浏览文件 @
d231e550
...
...
@@ -61,10 +61,10 @@ size_t VarDesc::GetTensorDescNum() const {
void
VarDesc
::
SetShapes
(
const
std
::
vector
<
std
::
vector
<
int64_t
>>
&
multiple_dims
)
{
if
(
multiple_dims
.
size
()
!=
GetTensorDescNum
())
{
VLOG
(
3
)
<<
"WARNING: The number of given shapes("
<<
multiple_dims
.
size
()
<<
") doesn't match the existing tensor number("
<<
GetTensorDescNum
()
<<
"). The Reader is going to be reinitialized."
;
VLOG
(
3
0
)
<<
"WARNING: The number of given shapes("
<<
multiple_dims
.
size
()
<<
") doesn't match the existing tensor number("
<<
GetTensorDescNum
()
<<
"). The Reader is going to be reinitialized."
;
SetTensorDescNum
(
multiple_dims
.
size
());
}
std
::
vector
<
proto
::
VarType
::
TensorDesc
*>
tensors
=
mutable_tensor_descs
();
...
...
@@ -94,11 +94,11 @@ void VarDesc::SetDataType(proto::VarType::Type data_type) {
void
VarDesc
::
SetDataTypes
(
const
std
::
vector
<
proto
::
VarType
::
Type
>
&
multiple_data_type
)
{
if
(
multiple_data_type
.
size
()
!=
GetTensorDescNum
())
{
VLOG
(
3
)
<<
"WARNING: The number of given data types("
<<
multiple_data_type
.
size
()
<<
") doesn't match the existing tensor number("
<<
GetTensorDescNum
()
<<
"). The Reader is going to be reinitialized."
;
VLOG
(
3
0
)
<<
"WARNING: The number of given data types("
<<
multiple_data_type
.
size
()
<<
") doesn't match the existing tensor number("
<<
GetTensorDescNum
()
<<
"). The Reader is going to be reinitialized."
;
SetTensorDescNum
(
multiple_data_type
.
size
());
}
std
::
vector
<
proto
::
VarType
::
TensorDesc
*>
tensor_descs
=
...
...
@@ -139,11 +139,11 @@ void VarDesc::SetLoDLevel(int32_t lod_level) {
void
VarDesc
::
SetLoDLevels
(
const
std
::
vector
<
int32_t
>
&
multiple_lod_level
)
{
if
(
multiple_lod_level
.
size
()
!=
GetTensorDescNum
())
{
VLOG
(
3
)
<<
"WARNING: The number of given lod_levels("
<<
multiple_lod_level
.
size
()
<<
") doesn't match the existing tensor number("
<<
GetTensorDescNum
()
<<
"). The Reader is going to be reinitialized."
;
VLOG
(
3
0
)
<<
"WARNING: The number of given lod_levels("
<<
multiple_lod_level
.
size
()
<<
") doesn't match the existing tensor number("
<<
GetTensorDescNum
()
<<
"). The Reader is going to be reinitialized."
;
SetTensorDescNum
(
multiple_lod_level
.
size
());
}
switch
(
desc_
.
type
().
type
())
{
...
...
paddle/fluid/framework/var_type_inference.h
浏览文件 @
d231e550
...
...
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/type_defs.h"
namespace
paddle
{
...
...
@@ -24,5 +27,27 @@ class VarTypeInference {
virtual
void
operator
()(
const
OpDesc
&
op_desc
,
BlockDesc
*
block
)
const
=
0
;
};
class
PassInDtypeAndVarTypeToOutput
:
public
framework
::
VarTypeInference
{
public:
void
operator
()(
const
framework
::
OpDesc
&
op_desc
,
framework
::
BlockDesc
*
block
)
const
final
{
auto
in_out_var_names
=
this
->
GetInputOutputWithSameType
();
for
(
auto
&
i_o_n
:
in_out_var_names
)
{
auto
&
x_name
=
op_desc
.
Input
(
i_o_n
.
first
).
at
(
0
);
auto
&
out_name
=
op_desc
.
Output
(
i_o_n
.
second
).
at
(
0
);
auto
&
x
=
block
->
FindRecursiveOrCreateVar
(
x_name
);
auto
&
out
=
block
->
FindRecursiveOrCreateVar
(
out_name
);
out
.
SetType
(
x
.
GetType
());
out
.
SetDataType
(
x
.
GetDataType
());
}
}
protected:
virtual
std
::
unordered_map
<
std
::
string
,
std
::
string
>
GetInputOutputWithSameType
()
const
=
0
;
};
}
// namespace framework
}
// namespace paddle
paddle/fluid/inference/analysis/analyzer.cc
浏览文件 @
d231e550
...
...
@@ -60,7 +60,7 @@ class DfgPassManagerImpl final : public DfgPassManager {
private:
void
AddPass
(
const
std
::
string
&
name
,
AnalysisPass
*
pass
)
{
VLOG
(
3
)
<<
"Adding pass "
<<
name
;
VLOG
(
3
0
)
<<
"Adding pass "
<<
name
;
Register
(
name
,
pass
);
AddGraphvizDebugerPass
(
pass
);
}
...
...
@@ -104,7 +104,7 @@ void Analyzer::Run(Argument* argument) {
passes
.
push_back
(
"graph_viz_pass"
);
// add graphviz for debug.
#ifdef PADDLE_WITH_MKLDNN
if
(
use_mkldnn_
)
{
VLOG
(
3
)
<<
"Adding MKL-DNN placement pass"
;
VLOG
(
3
0
)
<<
"Adding MKL-DNN placement pass"
;
passes
.
push_back
(
"mkldnn_placement_pass"
);
}
#endif
...
...
@@ -113,7 +113,9 @@ void Analyzer::Run(Argument* argument) {
passes
.
push_back
(
"infer_clean_graph_pass"
);
passes
.
push_back
(
"graph_viz_pass"
);
// add graphviz for debug.
for
(
auto
&
pass
:
ir_passes_
)
{
if
(
!
disabled_ir_passes_
.
count
(
pass
))
{
// skip mkldnn pass when use_mkldnn_ = false;
bool
skip_pass
=
(
!
use_mkldnn_
)
&&
pass
.
find
(
"mkldnn"
)
!=
std
::
string
::
npos
;
if
(
!
disabled_ir_passes_
.
count
(
pass
)
&&
!
skip_pass
)
{
passes
.
push_back
(
pass
);
passes
.
push_back
(
"graph_viz_pass"
);
// add graphviz for debug.
}
...
...
paddle/fluid/inference/analysis/argument.h
浏览文件 @
d231e550
...
...
@@ -68,8 +68,8 @@ struct Argument {
key
);
attrs_
[
key
]
=
data
;
attr_deleters_
[
key
]
=
[
data
,
key
]()
{
VLOG
(
3
)
<<
"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
;
VLOG
(
3
)
<<
"argument delete attr: "
<<
key
;
VLOG
(
3
0
)
<<
"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
;
VLOG
(
3
0
)
<<
"argument delete attr: "
<<
key
;
delete
data
;
};
}
...
...
paddle/fluid/inference/analysis/data_flow_graph.cc
浏览文件 @
d231e550
...
...
@@ -132,7 +132,7 @@ void DataFlowGraph::Build(const framework::ir::Graph &graph) {
Node
*
x
{
nullptr
};
if
(
ir_node
->
IsOp
())
{
PADDLE_ENFORCE
(
ir_node
->
Op
());
VLOG
(
4
)
<<
"get op "
<<
ir_node
<<
" "
<<
ir_node
->
Name
();
VLOG
(
4
0
)
<<
"get op "
<<
ir_node
<<
" "
<<
ir_node
->
Name
();
x
=
nodes
.
Create
(
Node
::
Type
::
kFunction
);
x
->
attr
(
"ir_node"
).
Pointer
()
=
ir_node
;
PADDLE_ENFORCE
(
ir_node
->
Op
()
->
Proto
());
...
...
@@ -141,7 +141,7 @@ void DataFlowGraph::Build(const framework::ir::Graph &graph) {
}
else
if
(
ir_node
->
IsVar
())
{
// Not create a Node for IR ControlDepVar, considering Inference currently
// just used in single thread scenerio.
VLOG
(
4
)
<<
"get var "
<<
ir_node
->
Name
();
VLOG
(
4
0
)
<<
"get var "
<<
ir_node
->
Name
();
x
=
nodes
.
Create
(
Node
::
Type
::
kValue
);
x
->
attr
(
"ir_node"
).
Pointer
()
=
ir_node
;
x
->
SetName
(
ir_node
->
Name
());
...
...
@@ -151,9 +151,9 @@ void DataFlowGraph::Build(const framework::ir::Graph &graph) {
}
ir_node_map
.
emplace
(
ir_node
,
x
);
}
VLOG
(
4
)
<<
"finish creating Nodes"
;
VLOG
(
4
0
)
<<
"finish creating Nodes"
;
VLOG
(
4
)
<<
"to create edge"
;
VLOG
(
4
0
)
<<
"to create edge"
;
// Create links
for
(
auto
*
ir_node
:
graph
.
Nodes
())
{
auto
it
=
ir_node_map
.
find
(
ir_node
);
...
...
@@ -175,7 +175,7 @@ void DataFlowGraph::Build(const framework::ir::Graph &graph) {
"Can't deduce any inputs from the graph, Is the graph empty?"
);
ir_graph
=
&
graph
;
VLOG
(
3
)
<<
"finished build from IR"
;
VLOG
(
3
0
)
<<
"finished build from IR"
;
}
void
DataFlowGraph
::
Clean
()
{
...
...
paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
浏览文件 @
d231e550
...
...
@@ -239,9 +239,10 @@ void DataFlowGraphToFluidPass::AddEngineOp(Node *node) {
framework
::
BlockDesc
block_desc
(
nullptr
,
&
proto
);
block_desc
.
Proto
()
->
set_parent_idx
(
-
1
);
block_desc
.
Proto
()
->
set_idx
(
0
);
VLOG
(
4
)
<<
"origin variable size: "
<<
argument_
->
origin_program_desc
->
blocks
(
0
).
vars
().
size
();
VLOG
(
4
)
<<
"transformed variable size: "
<<
block_desc
.
Proto
()
->
vars
().
size
();
VLOG
(
40
)
<<
"origin variable size: "
<<
argument_
->
origin_program_desc
->
blocks
(
0
).
vars
().
size
();
VLOG
(
40
)
<<
"transformed variable size: "
<<
block_desc
.
Proto
()
->
vars
().
size
();
// copy ops.
for
(
auto
*
node
:
block_node
->
subgraph
)
{
...
...
paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc
浏览文件 @
d231e550
...
...
@@ -29,7 +29,7 @@ void DFG_GraphvizDrawPass::Run(DataFlowGraph *graph) {
auto
png_path
=
dot_path
.
substr
(
0
,
dot_path
.
size
()
-
4
)
+
".png"
;
std
::
string
message
;
VLOG
(
3
)
<<
"draw to "
<<
png_path
;
VLOG
(
3
0
)
<<
"draw to "
<<
png_path
;
ExecShellCommand
(
"dot -Tpng "
+
dot_path
+
" -o "
+
png_path
,
&
message
);
}
...
...
paddle/fluid/inference/analysis/fluid_to_ir_pass.cc
浏览文件 @
d231e550
...
...
@@ -29,7 +29,7 @@ void FluidToIrPass::EnableParamModify(const std::string &model_dir,
PADDLE_ENFORCE
(
argument_
);
argument_
->
Set
(
framework
::
ir
::
kParamScopeAttr
,
new
framework
::
Scope
);
// Load parameters.
VLOG
(
3
)
<<
"Loading parameters from "
<<
model_dir
;
VLOG
(
3
0
)
<<
"Loading parameters from "
<<
model_dir
;
LoadParams
(
&
argument_
->
Get
<
framework
::
Scope
>
(
framework
::
ir
::
kParamScopeAttr
),
model_dir
,
prog_file
,
param_file
);
}
...
...
paddle/fluid/inference/analysis/model_store_pass.cc
浏览文件 @
d231e550
...
...
@@ -35,21 +35,21 @@ void ModelStorePass::Run(DataFlowGraph *x) {
std
::
stringstream
ss
;
// NOTE these commands only works on linux.
ss
<<
"mkdir -p "
<<
*
argument_
->
model_output_store_path
;
VLOG
(
3
)
<<
"run command: "
<<
ss
.
str
();
VLOG
(
3
0
)
<<
"run command: "
<<
ss
.
str
();
PADDLE_ENFORCE_EQ
(
system
(
ss
.
str
().
c_str
()),
0
);
ss
.
str
(
""
);
ss
<<
"cp "
<<
*
argument_
->
fluid_model_dir
<<
"/*"
<<
" "
<<
*
argument_
->
model_output_store_path
;
VLOG
(
3
)
<<
"run command: "
<<
ss
.
str
();
VLOG
(
3
0
)
<<
"run command: "
<<
ss
.
str
();
PADDLE_ENFORCE_EQ
(
system
(
ss
.
str
().
c_str
()),
0
);
// Store program
PADDLE_ENFORCE_NOT_NULL
(
argument_
->
transformed_program_desc
,
"program desc is not transformed, should call "
"DataFlowGraphToFluidPass first."
);
VLOG
(
3
)
<<
"store analyzed program to "
<<
*
argument_
->
model_output_store_path
;
VLOG
(
3
0
)
<<
"store analyzed program to "
<<
*
argument_
->
model_output_store_path
;
const
std
::
string
program_output_path
=
*
argument_
->
model_output_store_path
+
"/__model__"
;
std
::
ofstream
file
(
program_output_path
,
std
::
ios
::
binary
);
...
...
paddle/fluid/inference/analysis/pass_manager.cc
浏览文件 @
d231e550
...
...
@@ -23,7 +23,7 @@ namespace analysis {
bool
PassManager
::
Initialize
(
Argument
*
argument
)
{
argument_
=
argument
;
for
(
auto
&
pass
:
data_
)
{
VLOG
(
3
)
<<
"Initializing pass ["
<<
pass
->
repr
()
<<
"]"
;
VLOG
(
3
0
)
<<
"Initializing pass ["
<<
pass
->
repr
()
<<
"]"
;
if
(
!
pass
->
Initialize
(
argument
))
{
LOG
(
ERROR
)
<<
"Failed to initialize pass ["
<<
pass
->
repr
()
<<
"]"
;
return
false
;
...
...
@@ -34,7 +34,7 @@ bool PassManager::Initialize(Argument* argument) {
void
DfgPassManager
::
RunAll
()
{
PADDLE_ENFORCE
(
argument_
);
VLOG
(
3
)
<<
"Total "
<<
data_
.
size
()
<<
" Analysys passes"
;
VLOG
(
3
0
)
<<
"Total "
<<
data_
.
size
()
<<
" Analysys passes"
;
for
(
auto
&
pass
:
data_
)
{
string
::
PrettyLogEndl
(
string
::
Style
::
H1
(),
"* Running Analysis pass [%s]"
,
pass
->
repr
());
...
...
paddle/fluid/inference/analysis/subgraph_splitter.cc
浏览文件 @
d231e550
...
...
@@ -232,7 +232,7 @@ std::vector<std::vector<Node *>> SubGraphSplitter::ExtractSubGraphs() {
BriefNode
*
brief_node
=
itr
.
second
;
if
(
!
brief_node
->
node
->
attr
(
kMarkerAttrName
).
Bool
())
{
VLOG
(
4
)
<<
brief_node
->
node
->
id
()
<<
" node not a trt candicate."
;
VLOG
(
4
0
)
<<
brief_node
->
node
->
id
()
<<
" node not a trt candicate."
;
continue
;
}
...
...
paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc
浏览文件 @
d231e550
...
...
@@ -25,9 +25,9 @@ TensorRTSubGraphPass::TensorRTSubGraphPass(
void
TensorRTSubGraphPass
::
Run
(
DataFlowGraph
*
graph
)
{
SubGraphFuse
(
graph
,
node_inside_subgraph_teller_
,
argument_
)();
VLOG
(
4
)
<<
"debug info "
<<
graph
->
HumanReadableInfo
(
false
/*show_values*/
,
true
/*show_functions*/
);
VLOG
(
4
0
)
<<
"debug info "
<<
graph
->
HumanReadableInfo
(
false
/*show_values*/
,
true
/*show_functions*/
);
}
}
// namespace analysis
...
...
paddle/fluid/inference/api/analysis_predictor.cc
浏览文件 @
d231e550
...
...
@@ -38,7 +38,7 @@ using contrib::AnalysisConfig;
bool
AnalysisPredictor
::
Init
(
const
std
::
shared_ptr
<
framework
::
Scope
>
&
parent_scope
,
const
std
::
shared_ptr
<
framework
::
ProgramDesc
>
&
program
)
{
VLOG
(
3
)
<<
"Predictor::init()"
;
VLOG
(
3
0
)
<<
"Predictor::init()"
;
#if !defined(_WIN32)
if
(
FLAGS_profile
)
{
LOG
(
WARNING
)
<<
"Profiler is actived, might affect the performance"
;
...
...
@@ -89,7 +89,7 @@ bool AnalysisPredictor::Init(
bool
AnalysisPredictor
::
Run
(
const
std
::
vector
<
PaddleTensor
>
&
inputs
,
std
::
vector
<
PaddleTensor
>
*
output_data
,
int
batch_size
)
{
VLOG
(
3
)
<<
"Predictor::predict"
;
VLOG
(
3
0
)
<<
"Predictor::predict"
;
inference
::
Timer
timer
;
timer
.
tic
();
// set feed variable
...
...
@@ -109,7 +109,7 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
LOG
(
ERROR
)
<<
"fail to get fetches"
;
return
false
;
}
VLOG
(
3
)
<<
"predict cost: "
<<
timer
.
toc
()
<<
"ms"
;
VLOG
(
3
0
)
<<
"predict cost: "
<<
timer
.
toc
()
<<
"ms"
;
// Fix TensorArray reuse not cleaned bug.
tensor_array_batch_cleaner_
.
CollectTensorArrays
(
scope_
.
get
());
...
...
@@ -119,7 +119,7 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
bool
AnalysisPredictor
::
SetFeed
(
const
std
::
vector
<
PaddleTensor
>
&
inputs
,
framework
::
Scope
*
scope
)
{
VLOG
(
3
)
<<
"Predictor::set_feed"
;
VLOG
(
3
0
)
<<
"Predictor::set_feed"
;
if
(
inputs
.
size
()
!=
feeds_
.
size
())
{
LOG
(
ERROR
)
<<
"wrong feed input size, need "
<<
feeds_
.
size
()
<<
" but get "
<<
inputs
.
size
();
...
...
@@ -184,7 +184,7 @@ void AnalysisPredictor::GetFetchOne(const framework::LoDTensor &fetch,
bool
AnalysisPredictor
::
GetFetch
(
std
::
vector
<
PaddleTensor
>
*
outputs
,
framework
::
Scope
*
scope
)
{
VLOG
(
3
)
<<
"Predictor::get_fetch"
;
VLOG
(
3
0
)
<<
"Predictor::get_fetch"
;
outputs
->
resize
(
fetchs_
.
size
());
for
(
size_t
i
=
0
;
i
<
fetchs_
.
size
();
++
i
)
{
int
idx
=
boost
::
get
<
int
>
(
fetchs_
[
i
]
->
GetAttr
(
"col"
));
...
...
@@ -246,7 +246,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
}
CHECK
(
argument_
.
transformed_program_desc
);
VLOG
(
5
)
<<
"to prepare executor"
;
VLOG
(
5
0
)
<<
"to prepare executor"
;
inference_program_
.
reset
(
new
framework
::
ProgramDesc
(
*
argument_
.
transformed_program_desc
));
if
(
argument_
.
Has
(
framework
::
ir
::
kParamScopeAttr
))
{
...
...
@@ -260,7 +260,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
template
<
>
std
::
unique_ptr
<
PaddlePredictor
>
CreatePaddlePredictor
<
AnalysisConfig
,
PaddleEngineKind
::
kAnalysis
>
(
const
AnalysisConfig
&
config
)
{
VLOG
(
3
)
<<
"create AnalysisConfig"
;
VLOG
(
3
0
)
<<
"create AnalysisConfig"
;
if
(
config
.
use_gpu
)
{
// 1. GPU memeroy
PADDLE_ENFORCE_GT
(
...
...
@@ -274,7 +274,7 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
std
::
string
flag
=
"--fraction_of_gpu_memory_to_use="
+
std
::
to_string
(
config
.
fraction_of_gpu_memory
);
flags
.
push_back
(
flag
);
VLOG
(
3
)
<<
"set flag: "
<<
flag
;
VLOG
(
3
0
)
<<
"set flag: "
<<
flag
;
framework
::
InitGflags
(
flags
);
}
}
...
...
paddle/fluid/inference/api/analysis_predictor.h
浏览文件 @
d231e550
...
...
@@ -13,6 +13,8 @@
// limitations under the License.
#pragma once
#include <algorithm>
#include <map>
#include <string>
#include <vector>
#include "paddle/fluid/framework/naive_executor.h"
...
...
paddle/fluid/inference/api/api.cc
浏览文件 @
d231e550
...
...
@@ -16,7 +16,6 @@
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle_inference_api.h"
namespace
paddle
{
...
...
paddle/fluid/inference/api/api_impl.cc
浏览文件 @
d231e550
...
...
@@ -157,7 +157,7 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
LOG
(
ERROR
)
<<
"fail to get fetches"
;
return
false
;
}
VLOG
(
3
)
<<
"predict cost: "
<<
timer
.
toc
()
<<
"ms"
;
VLOG
(
3
0
)
<<
"predict cost: "
<<
timer
.
toc
()
<<
"ms"
;
// Fix TensorArray reuse not cleaned bug.
tensor_array_batch_cleaner_
.
CollectTensorArrays
(
scope_
.
get
());
...
...
paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
浏览文件 @
d231e550
...
...
@@ -34,7 +34,7 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
bool
Init
(
const
std
::
shared_ptr
<
framework
::
Scope
>&
parent_scope
)
{
FLAGS_IA_enable_tensorrt_subgraph_engine
=
true
;
VLOG
(
3
)
<<
"Predictor::init()"
;
VLOG
(
3
0
)
<<
"Predictor::init()"
;
if
(
config_
.
use_gpu
)
{
place_
=
paddle
::
platform
::
CUDAPlace
(
config_
.
device
);
}
else
{
...
...
@@ -70,7 +70,7 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
OptimizeInferenceProgram
();
ctx_
=
executor_
->
Prepare
(
*
inference_program_
,
0
);
VLOG
(
5
)
<<
"to create variables"
;
VLOG
(
5
0
)
<<
"to create variables"
;
executor_
->
CreateVariables
(
*
inference_program_
,
sub_scope_
?
sub_scope_
:
scope_
.
get
(),
0
);
// Get the feed_target_names and fetch_target_names
...
...
@@ -114,9 +114,9 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
new
ProgramDesc
(
*
inference_program_
->
Proto
()));
Singleton
<
Analyzer
>::
Global
().
Run
(
&
argument
);
CHECK
(
argument
.
transformed_program_desc
);
VLOG
(
5
)
<<
"transformed program:
\n
"
<<
argument
.
transformed_program_desc
->
SerializeAsString
();
VLOG
(
5
)
<<
"to prepare executor"
;
VLOG
(
5
0
)
<<
"transformed program:
\n
"
<<
argument
.
transformed_program_desc
->
SerializeAsString
();
VLOG
(
5
0
)
<<
"to prepare executor"
;
inference_program_
.
reset
(
new
framework
::
ProgramDesc
(
*
argument
.
transformed_program_desc
));
}
...
...
@@ -129,7 +129,7 @@ template <>
std
::
unique_ptr
<
PaddlePredictor
>
CreatePaddlePredictor
<
MixedRTConfig
,
PaddleEngineKind
::
kAutoMixedTensorRT
>
(
const
MixedRTConfig
&
config
)
{
VLOG
(
3
)
<<
"create TensorRTSubgraphPredictor"
;
VLOG
(
3
0
)
<<
"create TensorRTSubgraphPredictor"
;
if
(
config
.
use_gpu
)
{
// 1. GPU memeroy
PADDLE_ENFORCE_GT
(
...
...
@@ -143,7 +143,7 @@ CreatePaddlePredictor<MixedRTConfig, PaddleEngineKind::kAutoMixedTensorRT>(
std
::
string
flag
=
"--fraction_of_gpu_memory_to_use="
+
std
::
to_string
(
config
.
fraction_of_gpu_memory
);
flags
.
push_back
(
flag
);
VLOG
(
3
)
<<
"set flag: "
<<
flag
;
VLOG
(
3
0
)
<<
"set flag: "
<<
flag
;
framework
::
InitGflags
(
flags
);
}
}
...
...
paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc
浏览文件 @
d231e550
...
...
@@ -45,7 +45,7 @@ void Main() {
config
.
fraction_of_gpu_memory
=
0.1
;
// set by yourself
predictor
=
CreatePaddlePredictor
<
paddle
::
contrib
::
MixedRTConfig
>
(
config
);
VLOG
(
3
)
<<
"begin to process data"
;
VLOG
(
3
0
)
<<
"begin to process data"
;
// Just a single batch of data.
std
::
string
line
;
std
::
ifstream
file
(
FLAGS_data
);
...
...
@@ -60,13 +60,13 @@ void Main() {
PaddleBuf
(
record
.
data
.
data
(),
record
.
data
.
size
()
*
sizeof
(
float
));
input
.
dtype
=
PaddleDType
::
FLOAT32
;
VLOG
(
3
)
<<
"run executor"
;
VLOG
(
3
0
)
<<
"run executor"
;
std
::
vector
<
PaddleTensor
>
output
;
predictor
->
Run
({
input
},
&
output
,
1
);
VLOG
(
3
)
<<
"output.size "
<<
output
.
size
();
VLOG
(
3
0
)
<<
"output.size "
<<
output
.
size
();
auto
&
tensor
=
output
.
front
();
VLOG
(
3
)
<<
"output: "
<<
SummaryTensor
(
tensor
);
VLOG
(
3
0
)
<<
"output: "
<<
SummaryTensor
(
tensor
);
// compare with reference result
CheckOutput
(
FLAGS_refer
,
tensor
);
...
...
paddle/fluid/inference/api/demo_ci/utils.h
浏览文件 @
d231e550
...
...
@@ -47,7 +47,7 @@ static void split(const std::string& str, char sep,
}
Record
ProcessALine
(
const
std
::
string
&
line
)
{
VLOG
(
3
)
<<
"process a line"
;
VLOG
(
3
0
)
<<
"process a line"
;
std
::
vector
<
std
::
string
>
columns
;
split
(
line
,
'\t'
,
&
columns
);
CHECK_EQ
(
columns
.
size
(),
2UL
)
...
...
@@ -65,8 +65,8 @@ Record ProcessALine(const std::string& line) {
for
(
auto
&
s
:
shape_strs
)
{
record
.
shape
.
push_back
(
std
::
stoi
(
s
));
}
VLOG
(
3
)
<<
"data size "
<<
record
.
data
.
size
();
VLOG
(
3
)
<<
"data shape size "
<<
record
.
shape
.
size
();
VLOG
(
3
0
)
<<
"data size "
<<
record
.
data
.
size
();
VLOG
(
3
0
)
<<
"data shape size "
<<
record
.
shape
.
size
();
return
record
;
}
...
...
@@ -78,8 +78,8 @@ void CheckOutput(const std::string& referfile, const PaddleTensor& output) {
file
.
close
();
size_t
numel
=
output
.
data
.
length
()
/
PaddleDtypeSize
(
output
.
dtype
);
VLOG
(
3
)
<<
"predictor output numel "
<<
numel
;
VLOG
(
3
)
<<
"reference output numel "
<<
refer
.
data
.
size
();
VLOG
(
3
0
)
<<
"predictor output numel "
<<
numel
;
VLOG
(
3
0
)
<<
"reference output numel "
<<
refer
.
data
.
size
();
CHECK_EQ
(
numel
,
refer
.
data
.
size
());
switch
(
output
.
dtype
)
{
case
PaddleDType
::
INT64
:
{
...
...
paddle/fluid/inference/api/demo_ci/vis_demo.cc
浏览文件 @
d231e550
...
...
@@ -49,11 +49,11 @@ void Main(bool use_gpu) {
config
.
fraction_of_gpu_memory
=
0.1
;
// set by yourself
}
VLOG
(
3
)
<<
"init predictor"
;
VLOG
(
3
0
)
<<
"init predictor"
;
predictor
=
CreatePaddlePredictor
<
NativeConfig
>
(
config
);
analysis_predictor
=
CreatePaddlePredictor
<
AnalysisConfig
>
(
config
);
VLOG
(
3
)
<<
"begin to process data"
;
VLOG
(
3
0
)
<<
"begin to process data"
;
// Just a single batch of data.
std
::
string
line
;
std
::
ifstream
file
(
FLAGS_data
);
...
...
@@ -68,13 +68,13 @@ void Main(bool use_gpu) {
PaddleBuf
(
record
.
data
.
data
(),
record
.
data
.
size
()
*
sizeof
(
float
));
input
.
dtype
=
PaddleDType
::
FLOAT32
;
VLOG
(
3
)
<<
"run executor"
;
VLOG
(
3
0
)
<<
"run executor"
;
std
::
vector
<
PaddleTensor
>
output
,
analysis_output
;
predictor
->
Run
({
input
},
&
output
,
1
);
VLOG
(
3
)
<<
"output.size "
<<
output
.
size
();
VLOG
(
3
0
)
<<
"output.size "
<<
output
.
size
();
auto
&
tensor
=
output
.
front
();
VLOG
(
3
)
<<
"output: "
<<
SummaryTensor
(
tensor
);
VLOG
(
3
0
)
<<
"output: "
<<
SummaryTensor
(
tensor
);
// compare with reference result
CheckOutput
(
FLAGS_refer
,
tensor
);
...
...
paddle/fluid/inference/api/details/reset_tensor_array.cc
浏览文件 @
d231e550
...
...
@@ -26,7 +26,7 @@ void TensorArrayBatchCleaner::CollectTensorArrays(framework::Scope *scope) {
// parameter.
if
(
var_name
==
"feed"
||
var_name
==
"fetch"
)
continue
;
if
(
var
->
Type
()
==
typeid
(
framework
::
LoDTensorArray
))
{
VLOG
(
4
)
<<
"collect "
<<
var_name
;
VLOG
(
4
0
)
<<
"collect "
<<
var_name
;
arrays_
.
push_back
(
var
->
GetMutable
<
framework
::
LoDTensorArray
>
());
}
}
...
...
@@ -34,7 +34,7 @@ void TensorArrayBatchCleaner::CollectTensorArrays(framework::Scope *scope) {
CollectTensorArrays
(
kid
);
}
VLOG
(
3
)
<<
"Collect "
<<
arrays_
.
size
()
<<
" arrays"
;
VLOG
(
3
0
)
<<
"Collect "
<<
arrays_
.
size
()
<<
" arrays"
;
flag_
=
false
;
}
}
...
...
paddle/fluid/inference/api/helper.h
浏览文件 @
d231e550
...
...
@@ -16,13 +16,14 @@
#include <glog/logging.h>
#include <sys/time.h>
#include <algorithm>
#include <chrono> // NOLINT
#include <numeric>
#include <sstream>
#include <string>
#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/string/printf.h"
#include "paddle_inference_api.h"
namespace
paddle
{
namespace
inference
{
...
...
paddle/fluid/inference/io.cc
浏览文件 @
d231e550
...
...
@@ -59,7 +59,8 @@ void ReadBinaryFile(const std::string& filename, std::string* contents) {
bool
IsPersistable
(
const
framework
::
VarDesc
*
var
)
{
if
(
var
->
Persistable
()
&&
var
->
GetType
()
!=
framework
::
proto
::
VarType
::
FEED_MINIBATCH
&&
var
->
GetType
()
!=
framework
::
proto
::
VarType
::
FETCH_LIST
)
{
var
->
GetType
()
!=
framework
::
proto
::
VarType
::
FETCH_LIST
&&
var
->
GetType
()
!=
framework
::
proto
::
VarType
::
RAW
)
{
return
true
;
}
return
false
;
...
...
@@ -77,7 +78,7 @@ void LoadPersistables(framework::Executor* executor, framework::Scope* scope,
for
(
auto
*
var
:
global_block
.
AllVars
())
{
if
(
IsPersistable
(
var
))
{
VLOG
(
3
)
<<
"persistable variable's name: "
<<
var
->
Name
();
VLOG
(
3
0
)
<<
"persistable variable's name: "
<<
var
->
Name
();
framework
::
VarDesc
*
new_var
=
load_block
->
Var
(
var
->
Name
());
new_var
->
SetShape
(
var
->
GetShape
());
...
...
@@ -120,7 +121,7 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor,
const
std
::
string
&
dirname
)
{
std
::
string
model_filename
=
dirname
+
"/__model__"
;
std
::
string
program_desc_str
;
VLOG
(
3
)
<<
"loading model from "
<<
model_filename
;
VLOG
(
3
0
)
<<
"loading model from "
<<
model_filename
;
ReadBinaryFile
(
model_filename
,
&
program_desc_str
);
std
::
unique_ptr
<
framework
::
ProgramDesc
>
main_program
(
...
...
paddle/fluid/inference/tensorrt/convert/concat_op.cc
浏览文件 @
d231e550
...
...
@@ -25,7 +25,7 @@ class ConcatOpConverter : public OpConverter {
public:
void
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
override
{
VLOG
(
4
)
<<
"convert a fluid mul op to tensorrt mul layer without bias"
;
VLOG
(
4
0
)
<<
"convert a fluid mul op to tensorrt mul layer without bias"
;
framework
::
OpDesc
op_desc
(
op
,
nullptr
);
// Declare inputs
...
...
paddle/fluid/inference/tensorrt/convert/dropout_op.cc
浏览文件 @
d231e550
...
...
@@ -25,7 +25,7 @@ class DropoutOpConverter : public OpConverter {
public:
void
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
override
{
VLOG
(
4
)
<<
"convert a fluid dropout op to tensorrt dropout layer"
;
VLOG
(
4
0
)
<<
"convert a fluid dropout op to tensorrt dropout layer"
;
framework
::
OpDesc
op_desc
(
op
,
nullptr
);
// Declare inputs
auto
*
input1
=
engine_
->
GetITensor
(
op_desc
.
Input
(
"X"
)[
0
]);
...
...
paddle/fluid/inference/tensorrt/convert/fc_op.cc
浏览文件 @
d231e550
...
...
@@ -52,7 +52,7 @@ class FcOpConverter : public OpConverter {
public:
void
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
override
{
VLOG
(
4
)
<<
"convert a fluid fc op to tensorrt fc layer without bias"
;
VLOG
(
4
0
)
<<
"convert a fluid fc op to tensorrt fc layer without bias"
;
framework
::
OpDesc
op_desc
(
op
,
nullptr
);
PADDLE_ENFORCE_EQ
(
op_desc
.
Input
(
"X"
).
size
(),
1
);
...
...
paddle/fluid/inference/tensorrt/convert/mul_op.cc
浏览文件 @
d231e550
...
...
@@ -25,7 +25,7 @@ class MulOpConverter : public OpConverter {
public:
void
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
override
{
VLOG
(
4
)
<<
"convert a fluid mul op to tensorrt mul layer without bias"
;
VLOG
(
4
0
)
<<
"convert a fluid mul op to tensorrt mul layer without bias"
;
framework
::
OpDesc
op_desc
(
op
,
nullptr
);
// Declare inputs
...
...
paddle/fluid/inference/tensorrt/convert/pad_op.cc
浏览文件 @
d231e550
...
...
@@ -25,7 +25,7 @@ class PadOpConverter : public OpConverter {
public:
void
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
override
{
VLOG
(
4
)
<<
"convert a fluid transpose op to tensorrt tranpose layer"
;
VLOG
(
4
0
)
<<
"convert a fluid transpose op to tensorrt tranpose layer"
;
framework
::
OpDesc
op_desc
(
op
,
nullptr
);
// Declare inputs
...
...
paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
浏览文件 @
d231e550
...
...
@@ -25,7 +25,7 @@ class Pool2dOpConverter : public OpConverter {
public:
void
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
override
{
VLOG
(
4
)
VLOG
(
4
0
)
<<
"convert a fluid pool2d op to tensorrt pool2d layer without bias"
;
framework
::
OpDesc
op_desc
(
op
,
nullptr
);
// Declare inputs
...
...
paddle/fluid/inference/tensorrt/convert/softmax_op.cc
浏览文件 @
d231e550
...
...
@@ -25,7 +25,7 @@ class SoftMaxOpConverter : public OpConverter {
public:
void
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
override
{
VLOG
(
4
)
VLOG
(
4
0
)
<<
"convert a fluid softmax op to tensorrt softmax layer without bias"
;
framework
::
OpDesc
op_desc
(
op
,
nullptr
);
// Declare inputs
...
...
paddle/fluid/inference/tensorrt/engine.h
浏览文件 @
d231e550
...
...
@@ -134,7 +134,7 @@ class TensorRTEngine : public EngineBase {
std
::
unordered_map
<
std
::
string
/*name*/
,
std
::
unique_ptr
<
framework
::
Tensor
>>
weight_map
;
// TODO
:
(NHZLX)
// TODO(NHZLX)
// In the normal case, the paddle-trt exists bug when runing the googlenet.
// When there are more than two convolutions of 1 * 1 with the same input, the
// paddle-tensorrt will do the merging optimization, which fuse those conv
...
...
paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
浏览文件 @
d231e550
...
...
@@ -27,7 +27,7 @@ struct Record {
};
Record
ProcessALine
(
const
std
::
string
&
line
)
{
VLOG
(
3
)
<<
"process a line"
;
VLOG
(
3
0
)
<<
"process a line"
;
std
::
vector
<
std
::
string
>
columns
;
split
(
line
,
'\t'
,
&
columns
);
CHECK_EQ
(
columns
.
size
(),
2UL
)
...
...
@@ -45,8 +45,8 @@ Record ProcessALine(const std::string &line) {
for
(
auto
&
s
:
shape_strs
)
{
record
.
shape
.
push_back
(
std
::
stoi
(
s
));
}
VLOG
(
3
)
<<
"data size "
<<
record
.
data
.
size
();
VLOG
(
3
)
<<
"data shape size "
<<
record
.
shape
.
size
();
VLOG
(
3
0
)
<<
"data size "
<<
record
.
data
.
size
();
VLOG
(
3
0
)
<<
"data shape size "
<<
record
.
shape
.
size
();
return
record
;
}
...
...
paddle/fluid/memory/detail/buddy_allocator.cc
浏览文件 @
d231e550
...
...
@@ -32,11 +32,11 @@ BuddyAllocator::BuddyAllocator(
system_allocator_
(
std
::
move
(
system_allocator
))
{}
BuddyAllocator
::~
BuddyAllocator
()
{
VLOG
(
10
)
<<
"BuddyAllocator Disconstructor makes sure that all of these "
"have actually been freed"
;
VLOG
(
10
0
)
<<
"BuddyAllocator Disconstructor makes sure that all of these "
"have actually been freed"
;
while
(
!
pool_
.
empty
())
{
auto
block
=
static_cast
<
MemoryBlock
*>
(
std
::
get
<
2
>
(
*
pool_
.
begin
()));
VLOG
(
10
)
<<
"Free from block ("
<<
block
<<
", "
<<
max_chunk_size_
<<
")"
;
VLOG
(
10
0
)
<<
"Free from block ("
<<
block
<<
", "
<<
max_chunk_size_
<<
")"
;
system_allocator_
->
Free
(
block
,
max_chunk_size_
,
block
->
index
(
cache_
));
cache_
.
invalidate
(
block
);
...
...
@@ -57,12 +57,12 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) {
// acquire the allocator lock
std
::
lock_guard
<
std
::
mutex
>
lock
(
mutex_
);
VLOG
(
10
)
<<
"Allocate "
<<
unaligned_size
<<
" bytes from chunk size "
<<
size
;
VLOG
(
10
0
)
<<
"Allocate "
<<
unaligned_size
<<
" bytes from chunk size "
<<
size
;
// if the allocation is huge, send directly to the system allocator
if
(
size
>
max_chunk_size_
)
{
VLOG
(
10
)
<<
"Allocate from system allocator."
;
VLOG
(
10
0
)
<<
"Allocate from system allocator."
;
return
SystemAlloc
(
size
);
}
...
...
@@ -77,9 +77,9 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) {
return
nullptr
;
}
}
else
{
VLOG
(
10
)
<<
"Allocation from existing memory block "
<<
std
::
get
<
2
>
(
*
it
)
<<
" at address "
<<
reinterpret_cast
<
MemoryBlock
*>
(
std
::
get
<
2
>
(
*
it
))
->
data
();
VLOG
(
10
0
)
<<
"Allocation from existing memory block "
<<
std
::
get
<
2
>
(
*
it
)
<<
" at address "
<<
reinterpret_cast
<
MemoryBlock
*>
(
std
::
get
<
2
>
(
*
it
))
->
data
();
}
total_used_
+=
size
;
...
...
@@ -96,10 +96,10 @@ void BuddyAllocator::Free(void* p) {
// Acquire the allocator lock
std
::
lock_guard
<
std
::
mutex
>
lock
(
mutex_
);
VLOG
(
10
)
<<
"Free from address "
<<
block
;
VLOG
(
10
0
)
<<
"Free from address "
<<
block
;
if
(
block
->
type
(
cache_
)
==
MemoryBlock
::
HUGE_CHUNK
)
{
VLOG
(
10
)
<<
"Free directly from system allocator"
;
VLOG
(
10
0
)
<<
"Free directly from system allocator"
;
system_allocator_
->
Free
(
block
,
block
->
total_size
(
cache_
),
block
->
index
(
cache_
));
...
...
@@ -116,8 +116,8 @@ void BuddyAllocator::Free(void* p) {
// Trying to merge the right buddy
if
(
block
->
has_right_buddy
(
cache_
))
{
VLOG
(
10
)
<<
"Merging this block "
<<
block
<<
" with its right buddy "
<<
block
->
right_buddy
(
cache_
);
VLOG
(
10
0
)
<<
"Merging this block "
<<
block
<<
" with its right buddy "
<<
block
->
right_buddy
(
cache_
);
auto
right_buddy
=
block
->
right_buddy
(
cache_
);
...
...
@@ -134,8 +134,8 @@ void BuddyAllocator::Free(void* p) {
// Trying to merge the left buddy
if
(
block
->
has_left_buddy
(
cache_
))
{
VLOG
(
10
)
<<
"Merging this block "
<<
block
<<
" with its left buddy "
<<
block
->
left_buddy
(
cache_
);
VLOG
(
10
0
)
<<
"Merging this block "
<<
block
<<
" with its left buddy "
<<
block
->
left_buddy
(
cache_
);
auto
left_buddy
=
block
->
left_buddy
(
cache_
);
...
...
@@ -151,8 +151,8 @@ void BuddyAllocator::Free(void* p) {
}
// Dumping this block into pool
VLOG
(
10
)
<<
"Inserting free block ("
<<
block
<<
", "
<<
block
->
total_size
(
cache_
)
<<
")"
;
VLOG
(
10
0
)
<<
"Inserting free block ("
<<
block
<<
", "
<<
block
->
total_size
(
cache_
)
<<
")"
;
pool_
.
insert
(
IndexSizeAddress
(
block
->
index
(
cache_
),
block
->
total_size
(
cache_
),
block
));
...
...
@@ -174,7 +174,7 @@ void* BuddyAllocator::SystemAlloc(size_t size) {
size_t
index
=
0
;
void
*
p
=
system_allocator_
->
Alloc
(
&
index
,
size
);
VLOG
(
10
)
<<
"Allocated "
<<
p
<<
" from system allocator."
;
VLOG
(
10
0
)
<<
"Allocated "
<<
p
<<
" from system allocator."
;
if
(
p
==
nullptr
)
return
nullptr
;
...
...
@@ -200,8 +200,8 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
if
(
p
==
nullptr
)
return
pool_
.
end
();
VLOG
(
10
)
<<
"Creating and inserting new block "
<<
p
<<
" from system allocator"
;
VLOG
(
10
0
)
<<
"Creating and inserting new block "
<<
p
<<
" from system allocator"
;
static_cast
<
MemoryBlock
*>
(
p
)
->
init
(
&
cache_
,
MemoryBlock
::
FREE_CHUNK
,
index
,
max_chunk_size_
,
nullptr
,
nullptr
);
...
...
@@ -245,19 +245,19 @@ void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it,
auto
block
=
static_cast
<
MemoryBlock
*>
(
std
::
get
<
2
>
(
*
it
));
pool_
.
erase
(
it
);
VLOG
(
10
)
<<
"Split block ("
<<
block
<<
", "
<<
block
->
total_size
(
cache_
)
<<
") into"
;
VLOG
(
10
0
)
<<
"Split block ("
<<
block
<<
", "
<<
block
->
total_size
(
cache_
)
<<
") into"
;
block
->
split
(
&
cache_
,
size
);
VLOG
(
10
)
<<
"Left block ("
<<
block
<<
", "
<<
block
->
total_size
(
cache_
)
<<
")"
;
VLOG
(
10
0
)
<<
"Left block ("
<<
block
<<
", "
<<
block
->
total_size
(
cache_
)
<<
")"
;
block
->
set_type
(
&
cache_
,
MemoryBlock
::
ARENA_CHUNK
);
// the rest of memory if exist
if
(
block
->
has_right_buddy
(
cache_
))
{
if
(
block
->
right_buddy
(
cache_
)
->
type
(
cache_
)
==
MemoryBlock
::
FREE_CHUNK
)
{
VLOG
(
10
)
<<
"Insert right block ("
<<
block
->
right_buddy
(
cache_
)
<<
", "
<<
block
->
right_buddy
(
cache_
)
->
total_size
(
cache_
)
<<
")"
;
VLOG
(
10
0
)
<<
"Insert right block ("
<<
block
->
right_buddy
(
cache_
)
<<
", "
<<
block
->
right_buddy
(
cache_
)
->
total_size
(
cache_
)
<<
")"
;
pool_
.
insert
(
IndexSizeAddress
(
block
->
right_buddy
(
cache_
)
->
index
(
cache_
),
...
...
@@ -284,7 +284,7 @@ void BuddyAllocator::CleanIdleFallBackAlloc() {
return
;
}
VLOG
(
10
)
<<
"Return block "
<<
block
<<
" to fallback allocator."
;
VLOG
(
10
0
)
<<
"Return block "
<<
block
<<
" to fallback allocator."
;
system_allocator_
->
Free
(
block
,
max_chunk_size_
,
block
->
index
(
cache_
));
cache_
.
invalidate
(
block
);
...
...
@@ -320,7 +320,7 @@ void BuddyAllocator::CleanIdleNormalAlloc() {
MemoryBlock
*
block
=
static_cast
<
MemoryBlock
*>
(
std
::
get
<
2
>
(
*
pool
));
VLOG
(
10
)
<<
"Return block "
<<
block
<<
" to base allocator."
;
VLOG
(
10
0
)
<<
"Return block "
<<
block
<<
" to base allocator."
;
system_allocator_
->
Free
(
block
,
max_chunk_size_
,
block
->
index
(
cache_
));
cache_
.
invalidate
(
block
);
...
...
paddle/fluid/memory/detail/meta_cache.cc
浏览文件 @
d231e550
...
...
@@ -29,7 +29,7 @@ MemoryBlock::Desc MetadataCache::load(const MemoryBlock* block) const {
return
existing_desc
->
second
;
}
else
{
auto
*
desc
=
reinterpret_cast
<
const
MemoryBlock
::
Desc
*>
(
block
);
VLOG
(
10
)
<<
"Load MemoryBlock::Desc type="
<<
desc
->
type
;
VLOG
(
10
0
)
<<
"Load MemoryBlock::Desc type="
<<
desc
->
type
;
PADDLE_ASSERT
(
desc
->
check_guards
());
return
*
reinterpret_cast
<
const
MemoryBlock
::
Desc
*>
(
block
);
}
...
...
paddle/fluid/memory/malloc.cc
浏览文件 @
d231e550
...
...
@@ -78,7 +78,7 @@ void* Alloc<platform::CPUPlace>(const platform::CPUPlace& place, size_t size) {
if
(
FLAGS_init_allocated_mem
)
{
memset
(
p
,
0xEF
,
size
);
}
VLOG
(
10
)
<<
" pointer="
<<
p
;
VLOG
(
10
0
)
<<
" pointer="
<<
p
;
return
p
;
}
...
...
@@ -111,12 +111,12 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
std
::
unique_ptr
<
detail
::
SystemAllocator
>
(
new
detail
::
GPUAllocator
(
i
)),
platform
::
GpuMinChunkSize
(),
platform
::
GpuMaxChunkSize
());
VLOG
(
10
)
<<
"
\n\n
NOTE: each GPU device use "
<<
FLAGS_fraction_of_gpu_memory_to_use
*
100
<<
"% of GPU memory.
\n
"
<<
"You can set GFlags environment variable '"
<<
"FLAGS_fraction_of_gpu_memory_to_use"
<<
"' to change the fraction of GPU usage.
\n\n
"
;
VLOG
(
10
0
)
<<
"
\n\n
NOTE: each GPU device use "
<<
FLAGS_fraction_of_gpu_memory_to_use
*
100
<<
"% of GPU memory.
\n
"
<<
"You can set GFlags environment variable '"
<<
"FLAGS_fraction_of_gpu_memory_to_use"
<<
"' to change the fraction of GPU usage.
\n\n
"
;
}
});
...
...
paddle/fluid/operators/CMakeLists.txt
浏览文件 @
d231e550
...
...
@@ -317,6 +317,7 @@ op_library(save_op DEPS lod_tensor)
op_library
(
load_op DEPS lod_tensor
)
op_library
(
save_combine_op DEPS lod_tensor
)
op_library
(
load_combine_op DEPS lod_tensor
)
op_library
(
tensor_array_to_tensor_op DEPS concat_op
)
op_library
(
concat_op DEPS concat_and_split
)
list
(
REMOVE_ITEM GENERAL_OPS
${
DEPS_OPS
}
)
...
...
paddle/fluid/operators/activation_op.cc
浏览文件 @
d231e550
...
...
@@ -91,16 +91,12 @@ class ActivationOp : public framework::OperatorWithKernel {
}
};
class
ActivationOpInferVarType
:
public
framework
::
VarTypeInference
{
public:
void
operator
()(
const
framework
::
OpDesc
&
op_desc
,
framework
::
BlockDesc
*
block
)
const
override
{
auto
x_name
=
op_desc
.
Input
(
"X"
)[
0
];
auto
out_name
=
op_desc
.
Output
(
"Out"
)[
0
];
auto
&
x
=
block
->
FindRecursiveOrCreateVar
(
x_name
);
auto
&
out
=
block
->
FindRecursiveOrCreateVar
(
out_name
);
out
.
SetType
(
x
.
GetType
());
out
.
SetDataType
(
x
.
GetDataType
());
class
ActivationOpInferVarType
:
public
framework
::
PassInDtypeAndVarTypeToOutput
{
protected:
std
::
unordered_map
<
std
::
string
,
std
::
string
>
GetInputOutputWithSameType
()
const
override
{
return
std
::
unordered_map
<
std
::
string
,
std
::
string
>
{{
"X"
,
/*->*/
"Out"
}};
}
};
...
...
paddle/fluid/operators/activation_op.h
浏览文件 @
d231e550
...
...
@@ -95,7 +95,7 @@ class ActivationGradKernel
auto
x
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
X
);
functor
(
*
place
,
x
,
out
,
dout
,
dx
);
}
else
{
VLOG
(
10
)
<<
" Inplace activation "
;
VLOG
(
10
0
)
<<
" Inplace activation "
;
auto
x
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
dX
);
functor
(
*
place
,
x
,
out
,
dout
,
dx
);
}
...
...
paddle/fluid/operators/adam_op.h
浏览文件 @
d231e550
...
...
@@ -297,7 +297,7 @@ class AdamOpKernel : public framework::OpKernel<T> {
auto
&
grad
=
Ref
(
ctx
.
Input
<
framework
::
SelectedRows
>
(
"Grad"
),
"Must set Grad"
);
if
(
grad
.
rows
().
size
()
==
0
)
{
VLOG
(
3
)
<<
"grad row size is 0!!"
;
VLOG
(
3
0
)
<<
"grad row size is 0!!"
;
return
;
}
...
...
paddle/fluid/operators/add_position_encoding_op.h
浏览文件 @
d231e550
...
...
@@ -66,9 +66,10 @@ class AddPositionEncodingKernel : public framework::OpKernel<T> {
x_lod
.
empty
()
?
max_seq_len
:
x_lod
[
0
][
i
+
1
]
-
x_lod
[
0
][
i
];
for
(
int
j
=
0
;
j
<
max_length
;
++
j
)
{
for
(
int
k
=
0
;
k
<
half_size
;
++
k
)
{
const
double
val
=
(
half_size
>
1
)
?
j
/
pow
(
10000.0
,
double
(
k
)
/
(
half_size
-
1
))
:
j
/
10000.0
;
const
double
val
=
(
half_size
>
1
)
?
j
/
pow
(
10000.0
,
static_cast
<
double
>
(
k
)
/
(
half_size
-
1
))
:
j
/
10000.0
;
dst_ptr
[
k
]
=
src_ptr
[
k
]
*
alpha
+
sin
(
val
)
*
beta
;
dst_ptr
[
half_size
+
k
]
=
src_ptr
[
half_size
+
k
]
*
alpha
+
cos
(
val
)
*
beta
;
...
...
paddle/fluid/operators/array_operator.h
浏览文件 @
d231e550
...
...
@@ -49,7 +49,7 @@ class ArrayOp : public framework::OperatorBase {
}
else
{
offset
=
static_cast
<
size_t
>
(
*
i_tensor
.
data
<
int64_t
>
());
}
VLOG
(
10
)
<<
" Offset = "
<<
offset
;
VLOG
(
10
0
)
<<
" Offset = "
<<
offset
;
return
offset
;
}
};
...
...
paddle/fluid/operators/array_to_lod_tensor_op.cc
浏览文件 @
d231e550
...
...
@@ -148,8 +148,8 @@ class ArrayToLoDTensorOp : public framework::OperatorBase {
size_t
start_offset
=
lod_and_offset
.
second
.
first
;
size_t
end_offset
=
lod_and_offset
.
second
.
second
;
VLOG
(
10
)
<<
"idx="
<<
idx
<<
" x_idx="
<<
x_idx
<<
" ["
<<
", "
<<
end_offset
<<
"]"
;
VLOG
(
10
0
)
<<
"idx="
<<
idx
<<
" x_idx="
<<
x_idx
<<
" ["
<<
", "
<<
end_offset
<<
"]"
;
// Copy data
PADDLE_ENFORCE_GE
(
end_offset
,
start_offset
);
size_t
len
=
end_offset
-
start_offset
;
...
...
paddle/fluid/operators/batch_norm_op.cc
浏览文件 @
d231e550
...
...
@@ -170,6 +170,15 @@ The required data format for this layer is one of the following:
}
};
class
BatchNormOpInferVarType
:
public
framework
::
PassInDtypeAndVarTypeToOutput
{
protected:
std
::
unordered_map
<
std
::
string
,
std
::
string
>
GetInputOutputWithSameType
()
const
override
{
return
std
::
unordered_map
<
std
::
string
,
std
::
string
>
{{
"X"
,
/*->*/
"Y"
}};
}
};
template
<
typename
T
>
class
BatchNormKernel
<
platform
::
CPUDeviceContext
,
T
>
:
public
framework
::
OpKernel
<
T
>
{
...
...
@@ -525,7 +534,7 @@ class BatchNormGradMaker : public framework::SingleGradOpDescMaker {
namespace
ops
=
paddle
::
operators
;
REGISTER_OPERATOR
(
batch_norm
,
ops
::
BatchNormOp
,
ops
::
BatchNormOpMaker
,
ops
::
BatchNormGradMaker
);
ops
::
BatchNorm
OpInferVarType
,
ops
::
BatchNorm
GradMaker
);
REGISTER_OPERATOR
(
batch_norm_grad
,
ops
::
BatchNormGradOp
);
REGISTER_OP_CPU_KERNEL
(
...
...
paddle/fluid/operators/batch_norm_op.cu.cc
浏览文件 @
d231e550
...
...
@@ -96,7 +96,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
mode_
=
CUDNN_BATCHNORM_SPATIAL
;
#endif
VLOG
(
3
)
<<
"Setting descriptors."
;
VLOG
(
3
0
)
<<
"Setting descriptors."
;
std
::
vector
<
int
>
dims
;
std
::
vector
<
int
>
strides
;
if
(
data_layout
==
DataLayout
::
kNCHW
)
{
...
...
paddle/fluid/operators/beam_search_op.cc
浏览文件 @
d231e550
...
...
@@ -33,11 +33,11 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids,
auto
items
=
SelectTopBeamSizeItems
(
pre_ids
,
pre_scores
);
auto
selected_items
=
ToMap
(
items
,
high_level
.
back
());
VLOG
(
3
)
<<
"selected_items:"
;
VLOG
(
3
0
)
<<
"selected_items:"
;
for
(
size_t
i
=
0
;
i
<
selected_items
.
size
();
++
i
)
{
VLOG
(
3
)
<<
"offset:"
<<
i
;
VLOG
(
3
0
)
<<
"offset:"
<<
i
;
for
(
auto
&
item
:
selected_items
[
i
])
{
VLOG
(
3
)
<<
ItemToString
(
item
);
VLOG
(
3
0
)
<<
ItemToString
(
item
);
}
}
...
...
@@ -138,11 +138,11 @@ std::vector<std::vector<BeamSearch::Item>> BeamSearch::SelectTopBeamSizeItems(
}
result
.
emplace_back
(
items
);
}
VLOG
(
3
)
<<
"SelectTopBeamSizeItems result size "
<<
result
.
size
();
VLOG
(
3
0
)
<<
"SelectTopBeamSizeItems result size "
<<
result
.
size
();
for
(
auto
&
items
:
result
)
{
VLOG
(
3
)
<<
"item set:"
;
VLOG
(
3
0
)
<<
"item set:"
;
for
(
auto
&
item
:
items
)
{
VLOG
(
3
)
<<
ItemToString
(
item
);
VLOG
(
3
0
)
<<
ItemToString
(
item
);
}
}
...
...
paddle/fluid/operators/bilinear_interp_op.h
已删除
100644 → 0
浏览文件 @
cf8d2e67
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace
paddle
{
namespace
operators
{
using
Tensor
=
framework
::
Tensor
;
template
<
typename
T
>
class
BilinearInterpKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
*
input_t
=
ctx
.
Input
<
Tensor
>
(
"X"
);
// float tensor
auto
*
output_t
=
ctx
.
Output
<
Tensor
>
(
"Out"
);
// float tensor
auto
out_dims
=
output_t
->
dims
();
auto
*
input
=
input_t
->
data
<
T
>
();
int
out_h
=
ctx
.
Attr
<
int
>
(
"out_h"
);
int
out_w
=
ctx
.
Attr
<
int
>
(
"out_w"
);
auto
out_size_t
=
ctx
.
Input
<
Tensor
>
(
"OutSize"
);
if
(
out_size_t
!=
nullptr
)
{
auto
out_size_data
=
out_size_t
->
data
<
int
>
();
out_h
=
out_size_data
[
0
];
out_w
=
out_size_data
[
1
];
}
auto
*
output
=
output_t
->
mutable_data
<
T
>
(
{
out_dims
[
0
],
out_dims
[
1
],
out_h
,
out_w
},
ctx
.
GetPlace
());
int
batch_size
=
input_t
->
dims
()[
0
];
int
channels
=
input_t
->
dims
()[
1
];
int
in_h
=
input_t
->
dims
()[
2
];
int
in_w
=
input_t
->
dims
()[
3
];
int
in_hw
=
in_h
*
in_w
;
int
out_hw
=
out_h
*
out_w
;
int
in_chw
=
channels
*
in_hw
;
int
out_chw
=
channels
*
out_hw
;
float
ratio_h
=
(
out_h
>
1
)
?
static_cast
<
float
>
(
in_h
-
1
)
/
(
out_h
-
1
)
:
0.
f
;
float
ratio_w
=
(
out_w
>
1
)
?
static_cast
<
float
>
(
in_w
-
1
)
/
(
out_w
-
1
)
:
0.
f
;
if
(
in_h
==
out_h
&&
in_w
==
out_w
)
{
memcpy
(
output
,
input
,
input_t
->
numel
()
*
sizeof
(
T
));
}
else
{
for
(
int
k
=
0
;
k
<
batch_size
;
++
k
)
{
// loop for batches
for
(
int
i
=
0
;
i
<
out_h
;
++
i
)
{
// loop for images
int
h
=
ratio_h
*
i
;
int
hid
=
(
h
<
in_h
-
1
)
?
1
:
0
;
float
h1lambda
=
ratio_h
*
i
-
h
;
float
h2lambda
=
1.
f
-
h1lambda
;
for
(
int
j
=
0
;
j
<
out_w
;
++
j
)
{
int
w
=
ratio_w
*
j
;
int
wid
=
(
w
<
in_w
-
1
)
?
1
:
0
;
float
w1lambda
=
ratio_w
*
j
-
w
;
float
w2lambda
=
1.
f
-
w1lambda
;
// calculate four position for bilinear interpolation
const
T
*
in_pos
=
&
input
[
k
*
in_chw
+
h
*
in_w
+
w
];
T
*
out_pos
=
&
output
[
k
*
out_chw
+
i
*
out_w
+
j
];
for
(
int
c
=
0
;
c
<
channels
;
++
c
)
{
// loop for channels
// bilinear interpolation
out_pos
[
0
]
=
static_cast
<
T
>
(
h2lambda
*
(
w2lambda
*
in_pos
[
0
]
+
w1lambda
*
in_pos
[
wid
])
+
h1lambda
*
(
w2lambda
*
in_pos
[
hid
*
in_w
]
+
w1lambda
*
in_pos
[
hid
*
in_w
+
wid
]));
in_pos
+=
in_hw
;
out_pos
+=
out_hw
;
}
}
}
}
}
}
};
template
<
typename
T
>
class
BilinearInterpGradKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
*
d_input_t
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"X"
));
auto
*
d_output_t
=
ctx
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"Out"
));
auto
*
d_output
=
d_output_t
->
data
<
T
>
();
auto
*
d_input
=
d_input_t
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
&
device_ctx
=
ctx
.
template
device_context
<
platform
::
CPUDeviceContext
>();
math
::
SetConstant
<
platform
::
CPUDeviceContext
,
T
>
zero
;
zero
(
device_ctx
,
d_input_t
,
static_cast
<
T
>
(
0.0
));
int
out_h
=
ctx
.
Attr
<
int
>
(
"out_h"
);
int
out_w
=
ctx
.
Attr
<
int
>
(
"out_w"
);
auto
out_size_t
=
ctx
.
Input
<
Tensor
>
(
"OutSize"
);
if
(
out_size_t
!=
nullptr
)
{
auto
out_size_data
=
out_size_t
->
data
<
int
>
();
out_h
=
out_size_data
[
0
];
out_w
=
out_size_data
[
1
];
}
int
batch_size
=
d_input_t
->
dims
()[
0
];
int
channels
=
d_input_t
->
dims
()[
1
];
int
in_h
=
d_input_t
->
dims
()[
2
];
int
in_w
=
d_input_t
->
dims
()[
3
];
int
in_hw
=
in_h
*
in_w
;
int
out_hw
=
out_h
*
out_w
;
int
in_chw
=
channels
*
in_hw
;
int
out_chw
=
channels
*
out_hw
;
float
ratio_h
=
(
out_h
>
1
)
?
static_cast
<
float
>
(
in_h
-
1
)
/
(
out_h
-
1
)
:
0.
f
;
float
ratio_w
=
(
out_w
>
1
)
?
static_cast
<
float
>
(
in_w
-
1
)
/
(
out_w
-
1
)
:
0.
f
;
if
(
in_h
==
out_h
&&
in_w
==
out_w
)
{
memcpy
(
d_input
,
d_output
,
d_input_t
->
numel
()
*
sizeof
(
T
));
}
else
{
for
(
int
k
=
0
;
k
<
batch_size
;
++
k
)
{
// loop for batches
for
(
int
i
=
0
;
i
<
out_h
;
++
i
)
{
// loop for images
int
h
=
ratio_h
*
i
;
int
hid
=
(
h
<
in_h
-
1
)
?
1
:
0
;
float
h1lambda
=
ratio_h
*
i
-
h
;
float
h2lambda
=
1
-
h1lambda
;
for
(
int
j
=
0
;
j
<
out_w
;
++
j
)
{
int
w
=
ratio_w
*
j
;
int
wid
=
(
w
<
in_w
-
1
)
?
1
:
0
;
float
w1lambda
=
ratio_w
*
j
-
w
;
float
w2lambda
=
1
-
w1lambda
;
T
*
in_pos
=
&
d_input
[
k
*
in_chw
+
h
*
in_w
+
w
];
const
T
*
out_pos
=
&
d_output
[
k
*
out_chw
+
i
*
out_w
+
j
];
for
(
int
c
=
0
;
c
<
channels
;
++
c
)
{
// loop for channels
in_pos
[
0
]
+=
static_cast
<
T
>
(
h2lambda
*
w2lambda
*
out_pos
[
0
]);
in_pos
[
wid
]
+=
static_cast
<
T
>
(
h2lambda
*
w1lambda
*
out_pos
[
0
]);
in_pos
[
hid
*
in_w
]
+=
static_cast
<
T
>
(
h1lambda
*
w2lambda
*
out_pos
[
0
]);
in_pos
[
hid
*
in_w
+
wid
]
+=
static_cast
<
T
>
(
h1lambda
*
w1lambda
*
out_pos
[
0
]);
in_pos
+=
in_hw
;
out_pos
+=
out_hw
;
}
}
}
}
}
}
};
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/checkpoint_notify_op.cc
浏览文件 @
d231e550
...
...
@@ -46,8 +46,8 @@ class CheckpointNotifyOp : public framework::OperatorBase {
auto
lookup_table_save_dir
=
string
::
Sprintf
(
"%s/%s_%d"
,
dir
,
lookup_table_name
,
i
);
rpc_client
->
AsyncCheckpointNotify
(
epmap
[
i
],
lookup_table_save_dir
);
VLOG
(
3
)
<<
"checkpoint notify sending lookup table: "
<<
lookup_table_name
<<
" and dir:"
<<
dir
<<
" to "
<<
epmap
[
i
];
VLOG
(
3
0
)
<<
"checkpoint notify sending lookup table: "
<<
lookup_table_name
<<
" and dir:"
<<
dir
<<
" to "
<<
epmap
[
i
];
}
PADDLE_ENFORCE
(
rpc_client
->
Wait
(),
"internal error in RPCClient"
);
}
...
...
paddle/fluid/operators/concat_op.cc
浏览文件 @
d231e550
...
...
@@ -37,7 +37,7 @@ class ConcatOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE_GT
(
n
,
0
,
"Input tensors count should > 0."
);
if
(
n
==
1
)
{
VLOG
(
3
)
<<
"Warning: concat op have only one input, may waste memory"
;
VLOG
(
3
0
)
<<
"Warning: concat op have only one input, may waste memory"
;
}
auto
out_dims
=
ins
[
0
];
...
...
paddle/fluid/operators/conv_cudnn_op.cu.cc
浏览文件 @
d231e550
...
...
@@ -15,15 +15,22 @@ limitations under the License. */
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/operators/conv_op.h"
#include "paddle/fluid/platform/assert.h"
#include "paddle/fluid/platform/cudnn_helper.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_bool
(
cudnn_deterministic
,
false
,
"Whether allow using an autotuning algorithm for convolution "
"operator. The autotuning algorithm may be non-deterministic. If "
"true, the algorithm is deterministic."
);
DEFINE_uint64
(
conv_workspace_size_limit
,
4096
,
"cuDNN convolution workspace limit in MB unit."
);
DEFINE_bool
(
cudnn_exhaustive_search
,
false
,
"Whether enable exhaustive search for cuDNN convolution or "
"not, defalut is False."
);
namespace
paddle
{
namespace
operators
{
...
...
@@ -36,13 +43,25 @@ using DataLayout = platform::DataLayout;
template
<
typename
T
>
using
ScalingParamType
=
typename
platform
::
CudnnDataType
<
T
>::
ScalingParamType
;
static
constexpr
char
kCUDNNFwdAlgoCache
[]
=
"kCUDNNFwdAlgoCache"
;
static
constexpr
char
kCUDNNBwdDataAlgoCache
[]
=
"kCUDNNBwdDataAlgoCache"
;
static
constexpr
char
kCUDNNBwdFilterAlgoCache
[]
=
"kCUDNNBwdFilterAlgoCache"
;
static
constexpr
size_t
kCONV_CUDNN_WORKSPACE_LIMIT_BYTES
=
static_cast
<
size_t
>
(
1024
)
*
1024
*
1024
;
static
constexpr
size_t
kNUM_CUDNN_FWD_ALGS
=
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT
;
static
constexpr
size_t
kNUM_CUDNN_BWD_FILTER_ALGS
=
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT
;
static
constexpr
size_t
kNUM_CUDNN_BWD_DATA_ALGS
=
CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT
;
template
<
typename
T
>
class
CUDNNConvOpKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
&
dev_ctx
=
ctx
.
template
device_context
<
platform
::
CUDADeviceContext
>();
PADDLE_ENFORCE
(
platform
::
is_gpu_place
(
ctx
.
GetPlace
()),
"It must use CUDAPlace."
);
auto
*
input
=
ctx
.
Input
<
Tensor
>
(
"Input"
);
...
...
@@ -55,6 +74,8 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
int
groups
=
ctx
.
Attr
<
int
>
(
"groups"
);
int64_t
user_workspace_size
=
static_cast
<
size_t
>
(
ctx
.
Attr
<
int
>
(
"workspace_size_MB"
));
bool
exhaustive_search
=
FLAGS_cudnn_exhaustive_search
||
ctx
.
Attr
<
bool
>
(
"exhaustive_search"
);
const
T
*
input_data
=
input
->
data
<
T
>
();
const
T
*
filter_data
=
filter
->
data
<
T
>
();
...
...
@@ -120,19 +141,19 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
// ------------------- cudnn conv workspace ---------------------
size_t
workspace_size_in_bytes
;
// final workspace to allocate.
size_t
workspace_size_limit
=
kCONV_CUDNN_WORKSPACE_LIMIT_BYTES
;
if
(
user_workspace_size
>
0
)
{
workspace_size_limit
=
user_workspace_size
*
1024
*
1024
;
if
(
FLAGS_conv_workspace_size_limit
>
0
||
user_workspace_size
>
0
)
{
int64_t
max_user_size
=
std
::
max
(
static_cast
<
int64_t
>
(
FLAGS_conv_workspace_size_limit
),
user_workspace_size
);
workspace_size_limit
=
max_user_size
*
1024
*
1024
;
}
// ------------------- cudnn conv algorithm ---------------------
cudnnConvolutionFwdAlgo_t
algo
;
auto
&
dev_ctx
=
ctx
.
template
device_context
<
platform
::
CUDADeviceContext
>();
auto
handle
=
dev_ctx
.
cudnn_handle
();
auto
workspace_handle
=
dev_ctx
.
cudnn_workspace_handle
();
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionForwardAlgorithm
(
handle
,
cudnn_input_desc
,
cudnn_filter_desc
,
cudnn_conv_desc
,
cudnn_output_desc
,
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT
,
workspace_size_limit
,
&
algo
));
bool
half_float
=
false
;
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
// Tensor core is supported since the volta GPU and
// is only enabled when input and filter data are float16
...
...
@@ -143,14 +164,66 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
cudnn_conv_desc
,
CUDNN_TENSOR_OP_MATH
));
// Currently tensor core is only enabled using this algo
algo
=
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
;
VLOG
(
5
)
<<
"use cudnn_tensor_op_math"
;
half_float
=
true
;
VLOG
(
50
)
<<
"use cudnn_tensor_op_math"
;
}
else
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnSetConvolutionMathType
(
cudnn_conv_desc
,
CUDNN_DEFAULT_MATH
));
VLOG
(
5
)
<<
"NOT use cudnn_tensor_op_math"
;
VLOG
(
5
0
)
<<
"NOT use cudnn_tensor_op_math"
;
}
#endif
auto
x_dims
=
framework
::
vectorize
(
input
->
dims
());
auto
f_dims
=
framework
::
vectorize
(
filter
->
dims
());
if
((
!
exhaustive_search
)
&&
(
!
half_float
))
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionForwardAlgorithm
(
handle
,
cudnn_input_desc
,
cudnn_filter_desc
,
cudnn_conv_desc
,
cudnn_output_desc
,
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT
,
workspace_size_limit
,
&
algo
));
VLOG
(
3
)
<<
"cuDNN forward algo "
<<
algo
;
}
else
if
(
exhaustive_search
&&
(
!
half_float
))
{
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>*
algo_cache
=
nullptr
;
if
(
ctx
.
scope
().
FindVar
(
kCUDNNFwdAlgoCache
))
{
algo_cache
=
ctx
.
scope
()
.
FindVar
(
kCUDNNFwdAlgoCache
)
->
GetMutable
<
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>>
();
}
else
{
algo_cache
=
const_cast
<
framework
::
Scope
&>
(
ctx
.
scope
())
.
Var
(
kCUDNNFwdAlgoCache
)
->
GetMutable
<
AlgorithmsCache
<
cudnnConvolutionFwdAlgo_t
>>
();
}
algo
=
algo_cache
->
GetAlgorithm
(
x_dims
,
f_dims
,
strides
,
paddings
,
dilations
,
0
,
[
&
]()
{
int
returned_algo_count
;
std
::
array
<
cudnnConvolutionFwdAlgoPerf_t
,
kNUM_CUDNN_FWD_ALGS
>
fwd_perf_stat
;
auto
cudnn_find_func
=
[
&
](
void
*
cudnn_workspace
)
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnFindConvolutionForwardAlgorithmEx
(
handle
,
cudnn_input_desc
,
input_data
,
cudnn_filter_desc
,
filter_data
,
cudnn_conv_desc
,
cudnn_output_desc
,
output_data
,
kNUM_CUDNN_FWD_ALGS
,
&
returned_algo_count
,
fwd_perf_stat
.
data
(),
cudnn_workspace
,
workspace_size_limit
));
};
workspace_handle
.
RunFunc
(
cudnn_find_func
,
workspace_size_limit
);
VLOG
(
3
)
<<
"Perf result: (algo: stat, time, memory)"
;
for
(
int
i
=
0
;
i
<
returned_algo_count
;
++
i
)
{
const
auto
&
stat
=
fwd_perf_stat
[
i
];
VLOG
(
3
)
<<
stat
.
algo
<<
": "
<<
stat
.
status
<<
" "
<<
stat
.
time
<<
" "
<<
stat
.
memory
;
}
return
fwd_perf_stat
[
0
].
algo
;
});
VLOG
(
3
)
<<
"choose algo "
<<
algo
;
}
else
{
PADDLE_ENFORCE
(
half_float
,
"cuDNN exhaustive search doesn't support half float."
);
}
// get workspace size able to allocate
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionForwardWorkspaceSize
(
handle
,
cudnn_input_desc
,
cudnn_filter_desc
,
cudnn_conv_desc
,
...
...
@@ -162,7 +235,6 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
// ------------------- cudnn conv forward ---------------------
ScalingParamType
<
T
>
alpha
=
1.0
f
,
beta
=
0.0
f
;
auto
workspace_handle
=
dev_ctx
.
cudnn_workspace_handle
();
for
(
int
i
=
0
;
i
<
groups
;
i
++
)
{
auto
cudnn_func
=
[
&
](
void
*
cudnn_workspace
)
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnConvolutionForward
(
...
...
@@ -180,6 +252,7 @@ template <typename T>
class
CUDNNConvGradOpKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
&
dev_ctx
=
ctx
.
template
device_context
<
platform
::
CUDADeviceContext
>();
PADDLE_ENFORCE
(
platform
::
is_gpu_place
(
ctx
.
GetPlace
()),
"It must use CUDAPlace."
);
auto
input
=
ctx
.
Input
<
Tensor
>
(
"Input"
);
...
...
@@ -198,6 +271,13 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
int
groups
=
ctx
.
Attr
<
int
>
(
"groups"
);
int64_t
user_workspace_size
=
static_cast
<
size_t
>
(
ctx
.
Attr
<
int
>
(
"workspace_size_MB"
));
bool
exhaustive_search
=
FLAGS_cudnn_exhaustive_search
||
ctx
.
Attr
<
bool
>
(
"exhaustive_search"
);
if
(
exhaustive_search
&&
FLAGS_cudnn_deterministic
)
{
PADDLE_THROW
(
"Cann't set exhaustive_search True and "
"FLAGS_cudnn_deterministic True at same time."
);
}
// ------------------- cudnn descriptors ---------------------
ScopedTensorDescriptor
input_desc
;
...
...
@@ -265,14 +345,66 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
cudnnConvolutionBwdFilterAlgo_t
filter_algo
;
size_t
workspace_size_in_bytes
=
0
,
tmp_size
=
0
;
size_t
workspace_size_limit
=
kCONV_CUDNN_WORKSPACE_LIMIT_BYTES
;
if
(
user_workspace_size
>
0
)
{
workspace_size_limit
=
user_workspace_size
*
1024
*
1024
;
if
(
FLAGS_conv_workspace_size_limit
>
0
||
user_workspace_size
>
0
)
{
int64_t
max_user_size
=
std
::
max
(
static_cast
<
int64_t
>
(
FLAGS_conv_workspace_size_limit
),
user_workspace_size
);
workspace_size_limit
=
max_user_size
*
1024
*
1024
;
}
auto
&
dev_ctx
=
ctx
.
template
device_context
<
platform
::
CUDADeviceContext
>();
auto
x_dims
=
framework
::
vectorize
(
input
->
dims
());
auto
f_dims
=
framework
::
vectorize
(
filter
->
dims
());
auto
handle
=
dev_ctx
.
cudnn_handle
();
auto
workspace_handle
=
dev_ctx
.
cudnn_workspace_handle
();
if
(
input_grad
)
{
if
(
!
FLAGS_cudnn_deterministic
)
{
T
*
input_grad_data
=
input_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
if
(
exhaustive_search
)
{
AlgorithmsCache
<
cudnnConvolutionBwdDataAlgo_t
>*
data_algo_cache
;
if
(
ctx
.
scope
().
FindVar
(
kCUDNNBwdDataAlgoCache
))
{
data_algo_cache
=
ctx
.
scope
()
.
FindVar
(
kCUDNNBwdDataAlgoCache
)
->
GetMutable
<
AlgorithmsCache
<
cudnnConvolutionBwdDataAlgo_t
>>
();
}
else
{
data_algo_cache
=
const_cast
<
framework
::
Scope
&>
(
ctx
.
scope
())
.
Var
(
kCUDNNBwdDataAlgoCache
)
->
GetMutable
<
AlgorithmsCache
<
cudnnConvolutionBwdDataAlgo_t
>>
();
}
data_algo
=
data_algo_cache
->
GetAlgorithm
(
x_dims
,
f_dims
,
strides
,
paddings
,
dilations
,
0
,
[
&
]()
{
int
returned_algo_count
;
std
::
array
<
cudnnConvolutionBwdDataAlgoPerf_t
,
kNUM_CUDNN_BWD_DATA_ALGS
>
data_perf_stat
;
auto
cudnn_find_bd_data_func
=
[
&
](
void
*
cudnn_workspace
)
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnFindConvolutionBackwardDataAlgorithmEx
(
handle
,
cudnn_filter_desc
,
filter_data
,
cudnn_output_grad_desc
,
output_grad_data
,
cudnn_conv_desc
,
cudnn_input_desc
,
input_grad_data
,
kNUM_CUDNN_BWD_DATA_ALGS
,
&
returned_algo_count
,
data_perf_stat
.
data
(),
cudnn_workspace
,
workspace_size_limit
));
};
workspace_handle
.
RunFunc
(
cudnn_find_bd_data_func
,
workspace_size_limit
);
VLOG
(
3
)
<<
"Perf result: (algo: stat, time, memory)"
;
for
(
int
i
=
0
;
i
<
returned_algo_count
;
++
i
)
{
const
auto
&
stat
=
data_perf_stat
[
i
];
VLOG
(
3
)
<<
stat
.
algo
<<
": "
<<
stat
.
status
<<
" "
<<
stat
.
time
<<
" "
<<
stat
.
memory
;
}
return
data_perf_stat
[
0
].
algo
;
});
VLOG
(
3
)
<<
"cuDNN backward data algo "
<<
data_algo
;
}
else
if
(
FLAGS_cudnn_deterministic
)
{
data_algo
=
CUDNN_CONVOLUTION_BWD_DATA_ALGO_1
;
}
else
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionBackwardDataAlgorithm
(
handle
,
cudnn_filter_desc
,
...
...
@@ -285,10 +417,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
cudnn_input_desc
,
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT
,
workspace_size_limit
,
&
data_algo
));
}
else
{
data_algo
=
CUDNN_CONVOLUTION_BWD_DATA_ALGO_1
;
}
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionBackwardDataWorkspaceSize
(
handle
,
cudnn_filter_desc
,
cudnn_output_grad_desc
,
...
...
@@ -297,17 +426,54 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
}
if
(
filter_grad
)
{
if
(
!
FLAGS_cudnn_deterministic
)
{
T
*
filter_grad_data
=
filter_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
if
(
exhaustive_search
)
{
AlgorithmsCache
<
cudnnConvolutionBwdFilterAlgo_t
>*
f_algo_cache
;
if
(
ctx
.
scope
().
FindVar
(
kCUDNNBwdFilterAlgoCache
))
{
f_algo_cache
=
ctx
.
scope
()
.
FindVar
(
kCUDNNBwdFilterAlgoCache
)
->
GetMutable
<
AlgorithmsCache
<
cudnnConvolutionBwdFilterAlgo_t
>>
();
}
else
{
f_algo_cache
=
const_cast
<
framework
::
Scope
&>
(
ctx
.
scope
())
.
Var
(
kCUDNNBwdFilterAlgoCache
)
->
GetMutable
<
AlgorithmsCache
<
cudnnConvolutionBwdFilterAlgo_t
>>
();
}
filter_algo
=
f_algo_cache
->
GetAlgorithm
(
x_dims
,
f_dims
,
strides
,
paddings
,
dilations
,
0
,
[
&
]()
{
int
returned_algo_count
;
std
::
array
<
cudnnConvolutionBwdFilterAlgoPerf_t
,
kNUM_CUDNN_BWD_FILTER_ALGS
>
filter_perf_stat
;
auto
cudnn_find_bd_f_func
=
[
&
](
void
*
cudnn_workspace
)
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnFindConvolutionBackwardFilterAlgorithmEx
(
handle
,
cudnn_input_desc
,
input_data
,
cudnn_output_grad_desc
,
output_grad_data
,
cudnn_conv_desc
,
cudnn_filter_desc
,
filter_grad_data
,
kNUM_CUDNN_BWD_FILTER_ALGS
,
&
returned_algo_count
,
filter_perf_stat
.
data
(),
cudnn_workspace
,
workspace_size_limit
));
};
workspace_handle
.
RunFunc
(
cudnn_find_bd_f_func
,
workspace_size_limit
);
return
filter_perf_stat
[
0
].
algo
;
});
VLOG
(
3
)
<<
"cuDNN backward filter algo "
<<
filter_algo
;
}
else
if
(
FLAGS_cudnn_deterministic
)
{
filter_algo
=
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1
;
}
else
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionBackwardFilterAlgorithm
(
handle
,
cudnn_input_desc
,
cudnn_output_grad_desc
,
cudnn_conv_desc
,
cudnn_filter_desc
,
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT
,
workspace_size_limit
,
&
filter_algo
));
}
else
{
filter_algo
=
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1
;
}
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionBackwardFilterWorkspaceSize
(
handle
,
cudnn_input_desc
,
cudnn_output_grad_desc
,
cudnn_conv_desc
,
...
...
@@ -317,7 +483,6 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
// ------------------- cudnn conv backward data ---------------------
ScalingParamType
<
T
>
alpha
=
1.0
f
,
beta
=
0.0
f
;
auto
workspace_handle
=
dev_ctx
.
cudnn_workspace_handle
();
if
(
input_grad
)
{
T
*
input_grad_data
=
input_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
// Because beta is zero, it is unnecessary to reset input_grad.
...
...
paddle/fluid/operators/conv_cudnn_op_cache.h
0 → 100644
浏览文件 @
d231e550
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <functional>
#include <unordered_map>
#include <vector>
namespace
paddle
{
namespace
operators
{
template
<
typename
TAlgorithm
>
class
AlgorithmsCache
{
public:
// Caches the best algorithm for a given
// combination of tensor dimensions & compute data type.
TAlgorithm
GetAlgorithm
(
const
std
::
vector
<
int64_t
>&
dims1
,
const
std
::
vector
<
int64_t
>&
dims2
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
const
std
::
vector
<
int
>&
dilations
,
int
algorithmFlags
,
// can set for different data type
std
::
function
<
TAlgorithm
()
>
gen_func
);
private:
std
::
unordered_map
<
int64_t
,
TAlgorithm
>
hash_
;
std
::
mutex
mutex_
;
};
template
<
typename
TAlgorithm
>
TAlgorithm
AlgorithmsCache
<
TAlgorithm
>::
GetAlgorithm
(
const
std
::
vector
<
int64_t
>&
dims1
,
const
std
::
vector
<
int64_t
>&
dims2
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
const
std
::
vector
<
int
>&
dilations
,
int
algorithmFlags
,
std
::
function
<
TAlgorithm
()
>
gen_func
)
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
mutex_
);
int64_t
seed
=
0
;
// Hash all of the inputs, use to try and look up a previously
// discovered algorithm, or fall back to generating a new one.
std
::
hash
<
int64_t
>
hashFn
;
// do hash like boost
// https://stackoverflow.com/questions/2590677/how-do-i-combine-hash-values-in-c0x
for
(
const
auto
num
:
dims1
)
{
seed
^=
hashFn
(
num
)
+
0x9e3779b9
+
(
seed
<<
6
)
+
(
seed
>>
2
);
}
for
(
const
auto
num
:
dims2
)
{
seed
^=
hashFn
(
num
)
+
0x9e3779b9
+
(
seed
<<
6
)
+
(
seed
>>
2
)
+
1
;
}
for
(
const
auto
num
:
strides
)
{
seed
^=
hashFn
(
static_cast
<
int64_t
>
(
num
))
+
0x9e3779b9
+
(
seed
<<
6
)
+
(
seed
>>
2
)
+
2
;
}
for
(
const
auto
num
:
paddings
)
{
seed
^=
hashFn
(
static_cast
<
int64_t
>
(
num
))
+
0x9e3779b9
+
(
seed
<<
6
)
+
(
seed
>>
2
)
+
3
;
}
for
(
const
auto
num
:
dilations
)
{
seed
^=
hashFn
(
static_cast
<
int64_t
>
(
num
))
+
0x9e3779b9
+
(
seed
<<
6
)
+
(
seed
>>
2
)
+
4
;
}
seed
^=
hashFn
(
static_cast
<
int64_t
>
(
algorithmFlags
))
+
0x9e3779b9
+
(
seed
<<
6
)
+
(
seed
>>
2
)
+
5
;
if
(
seed
==
0
)
return
gen_func
();
if
(
hash_
.
find
(
seed
)
==
hash_
.
end
())
{
TAlgorithm
value
=
gen_func
();
hash_
[
seed
]
=
value
;
}
return
hash_
[
seed
];
}
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/conv_op.cc
浏览文件 @
d231e550
...
...
@@ -189,6 +189,11 @@ void Conv2DOpMaker::Make() {
"workspace size can increase performance but also requires "
"better hardware. This size should be chosen carefully."
)
.
SetDefault
(
4096
);
AddAttr
<
bool
>
(
"exhaustive_search"
,
"(bool, default false) cuDNN has many algorithm to calculation "
"convolution, whether enable exhaustive search "
,
"for cuDNN convolution or not, defalut is False."
)
.
SetDefault
(
false
);
AddComment
(
R"DOC(
Convolution Operator.
...
...
@@ -219,6 +224,15 @@ $$
)DOC"
);
}
class
ConvOpInferVarType
:
public
framework
::
PassInDtypeAndVarTypeToOutput
{
protected:
std
::
unordered_map
<
std
::
string
,
std
::
string
>
GetInputOutputWithSameType
()
const
override
{
return
std
::
unordered_map
<
std
::
string
,
std
::
string
>
{
{
"Input"
,
/*->*/
"Output"
}};
}
};
void
Conv3DOpMaker
::
Make
()
{
AddInput
(
"Input"
,
...
...
@@ -283,7 +297,11 @@ void Conv3DOpMaker::Make() {
"workspace size can increase performance but also requires "
"better hardware. This size should be chosen carefully."
)
.
SetDefault
(
4096
);
AddAttr
<
bool
>
(
"exhaustive_search"
,
"(bool, default false) cuDNN has many algorithm to calculation "
"convolution, whether enable exhaustive search "
,
"for cuDNN convolution or not, defalut is False."
)
.
SetDefault
(
false
);
AddComment
(
R"DOC(
Convolution3D Operator.
...
...
@@ -356,6 +374,7 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
namespace
ops
=
paddle
::
operators
;
REGISTER_OPERATOR
(
conv2d
,
ops
::
ConvOp
,
ops
::
Conv2DOpMaker
,
ops
::
ConvOpInferVarType
,
paddle
::
framework
::
DefaultGradOpDescMaker
<
true
>
);
REGISTER_OPERATOR
(
conv2d_grad
,
ops
::
ConvOpGrad
);
...
...
@@ -363,7 +382,9 @@ REGISTER_OPERATOR(conv2d_grad, ops::ConvOpGrad);
REGISTER_OPERATOR
(
depthwise_conv2d
,
ops
::
ConvOp
,
ops
::
Conv2DOpMaker
,
paddle
::
framework
::
DefaultGradOpDescMaker
<
true
>
);
REGISTER_OPERATOR
(
depthwise_conv2d_grad
,
ops
::
ConvOpGrad
);
REGISTER_OPERATOR
(
conv3d
,
ops
::
ConvOp
,
ops
::
Conv3DOpMaker
,
ops
::
ConvOpInferVarType
,
paddle
::
framework
::
DefaultGradOpDescMaker
<
true
>
);
REGISTER_OPERATOR
(
conv3d_grad
,
ops
::
ConvOpGrad
);
...
...
paddle/fluid/operators/cross_entropy_op.cc
浏览文件 @
d231e550
...
...
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/cross_entropy_op.h"
#include <string>
namespace
paddle
{
namespace
operators
{
...
...
@@ -179,6 +180,15 @@ or not. But the output only shares the LoD information with input X.
)DOC"
);
}
};
class
CrossEntropyOpInferVarType
:
public
framework
::
PassInDtypeAndVarTypeToOutput
{
protected:
std
::
unordered_map
<
std
::
string
,
std
::
string
>
GetInputOutputWithSameType
()
const
override
{
return
std
::
unordered_map
<
std
::
string
,
std
::
string
>
{{
"X"
,
/*->*/
"Y"
}};
}
};
}
// namespace operators
}
// namespace paddle
...
...
@@ -186,6 +196,7 @@ namespace ops = paddle::operators;
using
CPUCtx
=
paddle
::
platform
::
CPUDeviceContext
;
REGISTER_OPERATOR
(
cross_entropy
,
ops
::
CrossEntropyOp
,
ops
::
CrossEntropyOpMaker
,
ops
::
CrossEntropyOpInferVarType
,
paddle
::
framework
::
DefaultGradOpDescMaker
<
true
>
);
REGISTER_OPERATOR
(
cross_entropy_grad
,
ops
::
CrossEntropyGradientOp
);
REGISTER_OP_CPU_KERNEL
(
cross_entropy
,
ops
::
CrossEntropyOpKernel
<
CPUCtx
,
float
>
,
...
...
paddle/fluid/operators/distributed/brpc_server.cc
浏览文件 @
d231e550
...
...
@@ -133,10 +133,10 @@ void AsyncBRPCServer::StartServer() {
void
AsyncBRPCServer
::
ShutDownImpl
()
{
server_
.
Stop
(
1000
);
}
void
AsyncBRPCServer
::
WaitServerReady
()
{
VLOG
(
3
)
<<
"AsyncGRPCServer is wait server ready"
;
VLOG
(
3
0
)
<<
"AsyncGRPCServer is wait server ready"
;
std
::
unique_lock
<
std
::
mutex
>
lock
(
this
->
mutex_ready_
);
condition_ready_
.
wait
(
lock
,
[
=
]
{
return
this
->
ready_
==
1
;
});
VLOG
(
3
)
<<
"AsyncGRPCServer WaitSeverReady"
;
VLOG
(
3
0
)
<<
"AsyncGRPCServer WaitSeverReady"
;
}
};
// namespace distributed
...
...
paddle/fluid/operators/distributed/grpc_client.cc
浏览文件 @
d231e550
...
...
@@ -38,7 +38,7 @@ void GRPCClient::SendComplete() {
std
::
unique_lock
<
std
::
mutex
>
lk
(
completed_mutex_
);
if
(
!
completed_
)
{
for
(
auto
&
it
:
channels_
)
{
VLOG
(
3
)
<<
"send complete message to "
<<
it
.
first
;
VLOG
(
3
0
)
<<
"send complete message to "
<<
it
.
first
;
this
->
AsyncSendComplete
(
it
.
first
);
}
PADDLE_ENFORCE
(
this
->
Wait
(),
"internal grpc error"
);
...
...
@@ -81,7 +81,7 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep,
::
grpc
::
ByteBuffer
req
;
SerializeToByteBuffer
(
var_name_val
,
var
,
*
p_ctx
,
&
req
,
""
,
trainer_id_
);
VLOG
(
3
)
<<
s
->
GetVarHandlePtr
()
->
String
()
<<
" begin"
;
VLOG
(
3
0
)
<<
s
->
GetVarHandlePtr
()
->
String
()
<<
" begin"
;
// stub context
s
->
response_call_back_
=
nullptr
;
...
...
@@ -142,7 +142,7 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep,
::
grpc
::
ByteBuffer
buf
;
RequestToByteBuffer
<
sendrecv
::
VariableMessage
>
(
req
,
&
buf
);
VLOG
(
3
)
<<
s
->
GetVarHandlePtr
()
->
String
()
<<
" begin"
;
VLOG
(
3
0
)
<<
s
->
GetVarHandlePtr
()
->
String
()
<<
" begin"
;
// stub context
s
->
response_call_back_
=
ProcGetResponse
;
...
...
@@ -190,7 +190,7 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep,
::
grpc
::
ByteBuffer
req
;
SerializeToByteBuffer
(
in_var_name_val
,
var
,
*
p_ctx
,
&
req
,
out_var_name_val
);
VLOG
(
3
)
<<
s
->
GetVarHandlePtr
()
->
String
()
<<
" begin"
;
VLOG
(
3
0
)
<<
s
->
GetVarHandlePtr
()
->
String
()
<<
" begin"
;
// stub context
s
->
response_call_back_
=
ProcGetResponse
;
...
...
@@ -328,14 +328,14 @@ void GRPCClient::Proceed() {
void
*
tag
=
nullptr
;
bool
ok
=
false
;
VLOG
(
3
)
<<
"GRPCClient Proceed begin"
;
VLOG
(
3
0
)
<<
"GRPCClient Proceed begin"
;
while
(
!
stopped_
&&
cq_
.
Next
(
&
tag
,
&
ok
))
{
BaseProcessor
*
c
=
static_cast
<
BaseProcessor
*>
(
tag
);
GPR_ASSERT
(
ok
);
PADDLE_ENFORCE
(
c
);
if
(
c
->
status_
.
ok
())
{
VLOG
(
3
)
<<
c
->
GetVarHandlePtr
()
->
String
()
<<
" process"
;
VLOG
(
3
0
)
<<
c
->
GetVarHandlePtr
()
->
String
()
<<
" process"
;
c
->
Process
();
}
else
if
(
c
->
status_
.
error_code
()
==
grpc
::
StatusCode
::
DEADLINE_EXCEEDED
)
{
// FIXME(gongwb): parse error_details?
...
...
@@ -370,7 +370,7 @@ void GRPCClient::Proceed() {
sync_cond_
.
notify_all
();
}
}
VLOG
(
3
)
<<
"GRPCClient Proceed end"
;
VLOG
(
3
0
)
<<
"GRPCClient Proceed end"
;
}
std
::
shared_ptr
<
grpc
::
Channel
>
GRPCClient
::
GetChannel
(
const
std
::
string
&
ep
)
{
...
...
paddle/fluid/operators/distributed/grpc_server.cc
浏览文件 @
d231e550
...
...
@@ -98,7 +98,7 @@ class RequestSend final : public RequestBase {
void
Process
()
override
{
std
::
string
varname
=
GetReqName
();
VLOG
(
4
)
<<
"RequestSend var_name:"
<<
varname
;
VLOG
(
4
0
)
<<
"RequestSend var_name:"
<<
varname
;
auto
scope
=
request_
->
GetMutableLocalScope
();
auto
invar
=
request_
->
GetVar
();
...
...
@@ -135,7 +135,7 @@ class RequestGet final : public RequestBase {
// proc request.
std
::
string
varname
=
request_
.
varname
();
int
trainer_id
=
request_
.
trainer_id
();
VLOG
(
4
)
<<
"RequestGet "
<<
varname
;
VLOG
(
4
0
)
<<
"RequestGet "
<<
varname
;
auto
scope
=
request_handler_
->
scope
();
auto
invar
=
scope
->
FindVar
(
varname
);
...
...
@@ -182,8 +182,8 @@ class RequestPrefetch final : public RequestBase {
std
::
string
in_var_name
=
request_
->
Varname
();
std
::
string
out_var_name
=
request_
->
OutVarname
();
int
trainer_id
=
request_
->
GetTrainerId
();
VLOG
(
4
)
<<
"RequestPrefetch, in_var_name: "
<<
in_var_name
<<
" out_var_name: "
<<
out_var_name
;
VLOG
(
4
0
)
<<
"RequestPrefetch, in_var_name: "
<<
in_var_name
<<
" out_var_name: "
<<
out_var_name
;
auto
scope
=
request_
->
GetMutableLocalScope
();
auto
invar
=
scope
->
FindVar
(
in_var_name
);
...
...
@@ -231,8 +231,8 @@ class RequestCheckpointNotify final : public RequestBase {
std
::
string
checkpoint_dir
=
request_
->
OutVarname
();
int
trainer_id
=
request_
->
GetTrainerId
();
VLOG
(
4
)
<<
"RequestCheckpointNotify notify: "
<<
checkpoint_notify
<<
", dir: "
<<
checkpoint_dir
;
VLOG
(
4
0
)
<<
"RequestCheckpointNotify notify: "
<<
checkpoint_notify
<<
", dir: "
<<
checkpoint_dir
;
request_handler_
->
Handle
(
checkpoint_notify
,
scope
,
nullptr
,
nullptr
,
trainer_id
,
checkpoint_dir
);
...
...
@@ -246,10 +246,10 @@ class RequestCheckpointNotify final : public RequestBase {
};
void
AsyncGRPCServer
::
WaitServerReady
()
{
VLOG
(
4
)
<<
"AsyncGRPCServer is wait server ready"
;
VLOG
(
4
0
)
<<
"AsyncGRPCServer is wait server ready"
;
std
::
unique_lock
<
std
::
mutex
>
lock
(
this
->
mutex_ready_
);
condition_ready_
.
wait
(
lock
,
[
=
]
{
return
this
->
ready_
==
1
;
});
VLOG
(
4
)
<<
"AsyncGRPCServer WaitSeverReady"
;
VLOG
(
4
0
)
<<
"AsyncGRPCServer WaitSeverReady"
;
}
void
AsyncGRPCServer
::
StartServer
()
{
...
...
@@ -282,14 +282,15 @@ void AsyncGRPCServer::StartServer() {
reqs
.
reserve
(
kRequestBufSize
);
for
(
int
i
=
0
;
i
<
kRequestBufSize
;
i
++
)
{
VLOG
(
6
)
<<
"TryToRegisterNewOne on RPC NAME: "
<<
rpc_name
<<
" I: "
<<
i
;
VLOG
(
60
)
<<
"TryToRegisterNewOne on RPC NAME: "
<<
rpc_name
<<
" I: "
<<
i
;
TryToRegisterNewOne
(
rpc_name
,
i
);
}
for
(
int
i
=
0
;
i
<
threadnum
;
i
++
)
{
rpc_threads_
[
rpc_name
].
emplace_back
(
new
std
::
thread
(
std
::
bind
(
&
AsyncGRPCServer
::
HandleRequest
,
this
,
cq
.
get
(),
rpc_name
,
f
)));
VLOG
(
4
)
<<
t
.
first
<<
" creates threads!"
;
VLOG
(
4
0
)
<<
t
.
first
<<
" creates threads!"
;
}
}
...
...
@@ -306,7 +307,7 @@ void AsyncGRPCServer::StartServer() {
auto
&
threads
=
t
.
second
;
for
(
size_t
i
=
0
;
i
<
threads
.
size
();
++
i
)
{
threads
[
i
]
->
join
();
VLOG
(
4
)
<<
t
.
first
<<
" threads ends!"
;
VLOG
(
4
0
)
<<
t
.
first
<<
" threads ends!"
;
}
}
}
...
...
@@ -314,7 +315,7 @@ void AsyncGRPCServer::StartServer() {
void
AsyncGRPCServer
::
ShutdownQueue
()
{
for
(
auto
&
t
:
rpc_cq_
)
{
t
.
second
->
Shutdown
();
VLOG
(
4
)
<<
t
.
first
<<
" queue shutdown!"
;
VLOG
(
4
0
)
<<
t
.
first
<<
" queue shutdown!"
;
}
}
...
...
@@ -323,7 +324,7 @@ void AsyncGRPCServer::ShutDownImpl() {
is_shut_down_
=
true
;
ShutdownQueue
();
VLOG
(
4
)
<<
"server_ shutdown!"
;
VLOG
(
4
0
)
<<
"server_ shutdown!"
;
server_
->
Shutdown
();
}
...
...
@@ -331,12 +332,12 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name,
int
req_id
)
{
std
::
unique_lock
<
std
::
mutex
>
lock
(
cq_mutex_
);
if
(
is_shut_down_
)
{
VLOG
(
4
)
<<
"shutdown, do not TryToRegisterNewSendOne"
;
VLOG
(
4
0
)
<<
"shutdown, do not TryToRegisterNewSendOne"
;
return
;
}
VLOG
(
4
)
<<
"TryToRegisterNewOne on RPC NAME: "
<<
rpc_name
<<
" REQ ID: "
<<
req_id
;
VLOG
(
4
0
)
<<
"TryToRegisterNewOne on RPC NAME: "
<<
rpc_name
<<
" REQ ID: "
<<
req_id
;
auto
&
reqs
=
rpc_reqs_
[
rpc_name
];
auto
&
handler
=
rpc_call_map_
[
rpc_name
];
...
...
@@ -357,7 +358,7 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name,
reqs
[
req_id
]
=
b
;
VLOG
(
4
)
<<
"Create RequestSend status:"
<<
b
->
Status
();
VLOG
(
4
0
)
<<
"Create RequestSend status:"
<<
b
->
Status
();
}
void
AsyncGRPCServer
::
HandleRequest
(
...
...
@@ -367,15 +368,15 @@ void AsyncGRPCServer::HandleRequest(
bool
ok
=
false
;
while
(
true
)
{
VLOG
(
4
)
<<
"HandleRequest "
<<
rpc_name
<<
" wait next"
;
VLOG
(
4
0
)
<<
"HandleRequest "
<<
rpc_name
<<
" wait next"
;
if
(
!
cq
->
Next
(
&
tag
,
&
ok
))
{
VLOG
(
3
)
<<
"CompletionQueue "
<<
rpc_name
<<
" shutdown!"
;
VLOG
(
3
0
)
<<
"CompletionQueue "
<<
rpc_name
<<
" shutdown!"
;
break
;
}
int
req_id
=
static_cast
<
int
>
(
reinterpret_cast
<
intptr_t
>
(
tag
));
VLOG
(
4
)
<<
"HandleRequest "
<<
rpc_name
<<
", req_id:"
<<
req_id
<<
" get next"
;
VLOG
(
4
0
)
<<
"HandleRequest "
<<
rpc_name
<<
", req_id:"
<<
req_id
<<
" get next"
;
auto
&
reqs
=
rpc_reqs_
[
rpc_name
];
RequestBase
*
base
=
nullptr
;
...
...
@@ -385,7 +386,7 @@ void AsyncGRPCServer::HandleRequest(
base
=
reqs
[
req_id
];
}
VLOG
(
3
)
<<
base
->
Status2String
(
rpc_name
);
VLOG
(
3
0
)
<<
base
->
Status2String
(
rpc_name
);
// reference:
// https://github.com/tensorflow/tensorflow/issues/5596
...
...
paddle/fluid/operators/distributed/request_handler.h
浏览文件 @
d231e550
...
...
@@ -75,7 +75,7 @@ class VarHandle {
wait_cond_
.
wait
(
lk
,
[
this
]
{
return
status_
!=
kDefaultState
;
});
ret
=
status_
;
}
VLOG
(
7
)
<<
"VarHandle wait:"
<<
ret
;
VLOG
(
7
0
)
<<
"VarHandle wait:"
<<
ret
;
return
ret
!=
kErrorState
;
}
...
...
@@ -84,7 +84,7 @@ class VarHandle {
std
::
unique_lock
<
std
::
mutex
>
lk
(
sync_mutex_
);
status_
=
ok
?
kFinishState
:
kErrorState
;
}
VLOG
(
7
)
<<
"VarHandle finish:"
<<
ok
;
VLOG
(
7
0
)
<<
"VarHandle finish:"
<<
ok
;
wait_cond_
.
notify_all
();
}
...
...
paddle/fluid/operators/distributed/request_handler_impl.cc
浏览文件 @
d231e550
...
...
@@ -38,19 +38,19 @@ bool RequestSendHandler::Handle(const std::string& varname,
framework
::
Variable
**
outvar
,
const
int
trainer_id
,
const
std
::
string
&
out_var_name
)
{
VLOG
(
4
)
<<
"RequestSendHandler:"
<<
varname
;
VLOG
(
4
0
)
<<
"RequestSendHandler:"
<<
varname
;
// Sync
if
(
varname
==
BATCH_BARRIER_MESSAGE
)
{
VLOG
(
3
)
<<
"sync: recv BATCH_BARRIER_MESSAGE"
;
VLOG
(
3
0
)
<<
"sync: recv BATCH_BARRIER_MESSAGE"
;
rpc_server_
->
IncreaseBatchBarrier
(
kRequestSend
);
}
else
if
(
varname
==
COMPLETE_MESSAGE
)
{
VLOG
(
3
)
<<
"sync: recv complete message"
;
VLOG
(
3
0
)
<<
"sync: recv complete message"
;
rpc_server_
->
Complete
();
}
else
{
// Async
if
(
!
sync_mode_
)
{
VLOG
(
3
)
<<
"async process var: "
<<
varname
;
VLOG
(
3
0
)
<<
"async process var: "
<<
varname
;
try
{
executor_
->
RunPreparedContext
((
*
grad_to_prepared_ctx_
)[
varname
].
get
(),
scope
);
...
...
@@ -61,7 +61,7 @@ bool RequestSendHandler::Handle(const std::string& varname,
return
true
;
}
else
{
// sync
rpc_server_
->
WaitCond
(
kRequestSend
);
VLOG
(
3
)
<<
"sync: processing received var: "
<<
varname
;
VLOG
(
3
0
)
<<
"sync: processing received var: "
<<
varname
;
if
(
invar
==
nullptr
)
{
LOG
(
FATAL
)
<<
"sync: Can not find server side var: "
<<
varname
;
...
...
@@ -78,10 +78,10 @@ bool RequestGetHandler::Handle(const std::string& varname,
framework
::
Variable
**
outvar
,
const
int
trainer_id
,
const
std
::
string
&
out_var_name
)
{
VLOG
(
4
)
<<
"RequestGetHandler:"
<<
varname
;
VLOG
(
4
0
)
<<
"RequestGetHandler:"
<<
varname
;
if
(
sync_mode_
)
{
if
(
varname
==
FETCH_BARRIER_MESSAGE
)
{
VLOG
(
3
)
<<
"sync: recv fetch barrier message"
;
VLOG
(
3
0
)
<<
"sync: recv fetch barrier message"
;
rpc_server_
->
IncreaseBatchBarrier
(
kRequestGet
);
}
else
{
rpc_server_
->
WaitCond
(
kRequestGet
);
...
...
@@ -93,13 +93,14 @@ bool RequestGetHandler::Handle(const std::string& varname,
// NOTE: the format is determined by distributed_transpiler.py
std
::
string
param_bak_name
=
string
::
Sprintf
(
"%s.trainer_%d_bak"
,
varname
,
trainer_id
);
VLOG
(
3
)
<<
"getting "
<<
param_bak_name
<<
" trainer_id "
<<
trainer_id
;
VLOG
(
30
)
<<
"getting "
<<
param_bak_name
<<
" trainer_id "
<<
trainer_id
;
auto
var
=
scope_
->
FindVar
(
varname
);
auto
t_orig
=
var
->
Get
<
framework
::
LoDTensor
>
();
auto
param_bak
=
scope_
->
Var
(
param_bak_name
);
auto
t
=
param_bak
->
GetMutable
<
framework
::
LoDTensor
>
();
t
->
mutable_data
(
dev_ctx_
->
GetPlace
(),
t_orig
.
type
());
VLOG
(
3
)
<<
"copying "
<<
varname
<<
" to "
<<
param_bak_name
;
VLOG
(
3
0
)
<<
"copying "
<<
varname
<<
" to "
<<
param_bak_name
;
framework
::
TensorCopy
(
t_orig
,
dev_ctx_
->
GetPlace
(),
t
);
}
*
outvar
=
scope_
->
FindVar
(
varname
);
...
...
@@ -114,7 +115,7 @@ bool RequestPrefetchHandler::Handle(const std::string& varname,
framework
::
Variable
**
outvar
,
const
int
trainer_id
,
const
std
::
string
&
out_var_name
)
{
VLOG
(
4
)
<<
"RequestPrefetchHandler "
<<
varname
;
VLOG
(
4
0
)
<<
"RequestPrefetchHandler "
<<
varname
;
auto
var_desc
=
program_
->
Block
(
0
).
FindVar
(
out_var_name
);
InitializeVariable
(
*
outvar
,
var_desc
->
GetType
());
...
...
@@ -138,8 +139,8 @@ bool RequestCheckpointHandler::Handle(const std::string& varname,
auto
*
lt_var
=
scope_
->
FindVar
(
LOOKUP_TABLE_PATH
)
->
GetMutable
<
std
::
string
>
();
lt_var
->
clear
();
lt_var
->
append
(
out_var_name
);
VLOG
(
4
)
<<
"RequestCheckpointHandler update var kLookupTablePath to: "
<<
out_var_name
;
VLOG
(
4
0
)
<<
"RequestCheckpointHandler update var kLookupTablePath to: "
<<
out_var_name
;
executor_
->
RunPreparedContext
(
checkpoint_prepared_ctx_
.
get
(),
scope_
);
return
true
;
}
...
...
paddle/fluid/operators/distributed/rpc_server.cc
浏览文件 @
d231e550
...
...
@@ -39,7 +39,7 @@ void RPCServer::SavePort() const {
port_file
.
open
(
file_path
);
port_file
<<
selected_port_
;
port_file
.
close
();
VLOG
(
4
)
<<
"selected port written to "
<<
file_path
;
VLOG
(
4
0
)
<<
"selected port written to "
<<
file_path
;
}
void
RPCServer
::
WaitBarrier
(
const
std
::
string
&
rpc_name
)
{
...
...
@@ -49,12 +49,12 @@ void RPCServer::WaitBarrier(const std::string& rpc_name) {
exit_flag_
.
load
());
});
VLOG
(
3
)
<<
"batch_barrier_: "
<<
rpc_name
<<
" "
<<
barrier_counter_
[
rpc_name
];
VLOG
(
3
0
)
<<
"batch_barrier_: "
<<
rpc_name
<<
" "
<<
barrier_counter_
[
rpc_name
];
}
void
RPCServer
::
IncreaseBatchBarrier
(
const
std
::
string
rpc_name
)
{
VLOG
(
4
)
<<
"RPCServer begin IncreaseBatchBarrier "
<<
rpc_name
;
VLOG
(
4
0
)
<<
"RPCServer begin IncreaseBatchBarrier "
<<
rpc_name
;
int
b
=
0
;
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
b
=
++
barrier_counter_
[
rpc_name
];
...
...
@@ -71,7 +71,7 @@ void RPCServer::Complete() {
client_num_
--
;
need_reset_all_vars_
=
true
;
VLOG
(
4
)
<<
"decrease client_num to: "
<<
client_num_
;
VLOG
(
4
0
)
<<
"decrease client_num to: "
<<
client_num_
;
if
(
cur_cond_
.
load
()
==
rpc_cond_map_
[
kRequestGet
])
{
barrier_counter_
[
kRequestGet
]
--
;
}
...
...
@@ -90,7 +90,7 @@ int RPCServer::GetClientNum() {
}
void
RPCServer
::
ResetBarrierCounter
()
{
VLOG
(
3
)
<<
"RPCServer ResetBarrierCounter "
;
VLOG
(
3
0
)
<<
"RPCServer ResetBarrierCounter "
;
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
for
(
auto
&
t
:
barrier_counter_
)
{
t
.
second
=
0
;
...
...
@@ -105,12 +105,12 @@ void RPCServer::RegisterRPC(const std::string& rpc_name,
static
int
cond
=
-
1
;
rpc_cond_map_
[
rpc_name
]
=
++
cond
;
VLOG
(
4
)
<<
"RegisterRPC rpc_name:"
<<
rpc_name
<<
", handler:"
<<
handler
<<
", cond:"
<<
rpc_cond_map_
[
rpc_name
];
VLOG
(
4
0
)
<<
"RegisterRPC rpc_name:"
<<
rpc_name
<<
", handler:"
<<
handler
<<
", cond:"
<<
rpc_cond_map_
[
rpc_name
];
}
void
RPCServer
::
SetCond
(
const
std
::
string
&
rpc_name
)
{
VLOG
(
3
)
<<
"RPCServer SetCond "
<<
rpc_name
;
VLOG
(
3
0
)
<<
"RPCServer SetCond "
<<
rpc_name
;
{
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
cur_cond_
=
rpc_cond_map_
[
rpc_name
];
...
...
@@ -120,7 +120,7 @@ void RPCServer::SetCond(const std::string& rpc_name) {
}
void
RPCServer
::
WaitCond
(
const
std
::
string
&
rpc_name
)
{
VLOG
(
4
)
<<
"RPCServer WaitCond "
<<
rpc_name
;
VLOG
(
4
0
)
<<
"RPCServer WaitCond "
<<
rpc_name
;
int
cond
=
0
;
{
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
...
...
paddle/fluid/operators/distributed/variable_response.cc
浏览文件 @
d231e550
...
...
@@ -50,7 +50,7 @@ bool VariableResponse::ReadRaw(::google::protobuf::io::CodedInputStream* input,
size_to_write
=
length
-
total_written
;
}
// This log is useful to see how long a internal block size is of rpc.
VLOG
(
7
)
<<
"copy "
<<
size_to_write
<<
" data to CUDAPlace"
;
VLOG
(
7
0
)
<<
"copy "
<<
size_to_write
<<
" data to CUDAPlace"
;
memory
::
Copy
(
boost
::
get
<
platform
::
CUDAPlace
>
(
place
),
reinterpret_cast
<
void
*>
(
p
),
cpu
,
data
,
size_to_write
,
gpu_dev_ctx
.
stream
());
...
...
@@ -79,7 +79,7 @@ bool VariableResponse::ReadRaw(::google::protobuf::io::CodedInputStream* input,
// TODO(gongwb): can we avoid copy?
platform
::
CPUPlace
cpu
;
// This log is useful to see how long a internal block size is of rpc.
VLOG
(
7
)
<<
"copy "
<<
size_to_write
<<
" data to CPUPlace"
;
VLOG
(
7
0
)
<<
"copy "
<<
size_to_write
<<
" data to CPUPlace"
;
memory
::
Copy
(
cpu
,
reinterpret_cast
<
void
*>
(
p
),
cpu
,
data
,
size_to_write
);
p
+=
size_to_write
;
...
...
@@ -198,8 +198,8 @@ bool VariableResponse::ProcSerializedField(
#endif
}
VLOG
(
7
)
<<
"ProcSerializedField:"
<<
meta_
.
varname
()
<<
", type:"
<<
meta_
.
type
()
<<
std
::
endl
;
VLOG
(
7
0
)
<<
"ProcSerializedField:"
<<
meta_
.
varname
()
<<
", type:"
<<
meta_
.
type
()
<<
std
::
endl
;
framework
::
DDim
dims
=
GetDims
(
meta_
.
dims
());
if
(
meta_
.
type
()
==
sendrecv
::
LOD_TENSOR
)
{
PADDLE_ENFORCE
(
meta_
.
lod_size
()
>=
0
,
"lod info should be got first!"
);
...
...
paddle/fluid/operators/elementwise_op.h
浏览文件 @
d231e550
...
...
@@ -75,16 +75,12 @@ class ElementwiseOp : public framework::OperatorWithKernel {
}
};
class
ElementwiseOpInferVarType
:
public
framework
::
VarTypeInference
{
public:
void
operator
()(
const
framework
::
OpDesc
&
op_desc
,
framework
::
BlockDesc
*
block
)
const
override
{
auto
x_name
=
op_desc
.
Input
(
"X"
)[
0
];
auto
out_name
=
op_desc
.
Output
(
"Out"
)[
0
];
auto
&
x
=
block
->
FindRecursiveOrCreateVar
(
x_name
);
auto
&
out
=
block
->
FindRecursiveOrCreateVar
(
out_name
);
out
.
SetType
(
x
.
GetType
());
out
.
SetDataType
(
x
.
GetDataType
());
class
ElementwiseOpInferVarType
:
public
framework
::
PassInDtypeAndVarTypeToOutput
{
protected:
std
::
unordered_map
<
std
::
string
,
std
::
string
>
GetInputOutputWithSameType
()
const
override
{
return
std
::
unordered_map
<
std
::
string
,
std
::
string
>
{{
"X"
,
/*->*/
"Out"
}};
}
};
...
...
paddle/fluid/operators/feed_op.cc
浏览文件 @
d231e550
...
...
@@ -47,8 +47,8 @@ class FeedOp : public framework::OperatorBase {
auto
col
=
Attr
<
int
>
(
"col"
);
VLOG
(
3
)
<<
"Feed Var "
<<
feed_var_name
<<
"'s "
<<
col
<<
" column to var "
<<
out_name
;
VLOG
(
3
0
)
<<
"Feed Var "
<<
feed_var_name
<<
"'s "
<<
col
<<
" column to var "
<<
out_name
;
auto
&
feed_list
=
feed_var
->
Get
<
framework
::
FeedFetchList
>
();
auto
&
feed_item
=
feed_list
.
at
(
static_cast
<
size_t
>
(
col
));
...
...
paddle/fluid/operators/fetch_barrier_op.cc
浏览文件 @
d231e550
...
...
@@ -43,7 +43,7 @@ class FetchBarrierOp : public framework::OperatorBase {
PADDLE_ENFORCE
(
rpc_client
->
Wait
(),
"internal error in RPCClient"
);
for
(
auto
&
ep
:
eps
)
{
VLOG
(
3
)
<<
"fetch barrier, ep: "
<<
ep
;
VLOG
(
3
0
)
<<
"fetch barrier, ep: "
<<
ep
;
rpc_client
->
AsyncSendFetchBarrier
(
ep
);
}
PADDLE_ENFORCE
(
rpc_client
->
Wait
(),
"internal error in RPCClient"
);
...
...
paddle/fluid/operators/fetch_op.cc
浏览文件 @
d231e550
...
...
@@ -57,7 +57,7 @@ class FetchOp : public framework::OperatorBase {
TensorCopySync
(
src_item
,
platform
::
CPUPlace
(),
&
dst_item
);
dst_item
.
set_lod
(
src_item
.
lod
());
VLOG
(
3
)
<<
"Fetch variable "
<<
fetch_var_name
<<
" to "
<<
out_name
;
VLOG
(
3
0
)
<<
"Fetch variable "
<<
fetch_var_name
<<
" to "
<<
out_name
;
}
};
...
...
paddle/fluid/operators/gen_nccl_id_op.cc
浏览文件 @
d231e550
...
...
@@ -64,7 +64,7 @@ class GenNCCLIdOp : public framework::OperatorBase {
distributed
::
RPCClient
::
GetInstance
<
RPCCLIENT_T
>
(
0
);
for
(
auto
&
ep
:
endpoint_list
)
{
VLOG
(
3
)
<<
"sending nccl id to "
<<
ep
;
VLOG
(
3
0
)
<<
"sending nccl id to "
<<
ep
;
client
->
AsyncSendVar
(
ep
,
dev_ctx
,
*
scope
,
NCCL_ID_VARNAME
);
}
client
->
Wait
();
...
...
@@ -72,7 +72,7 @@ class GenNCCLIdOp : public framework::OperatorBase {
client
->
AsyncSendBatchBarrier
(
ep
);
}
client
->
Wait
();
VLOG
(
3
)
<<
"sending completed..."
;
VLOG
(
3
0
)
<<
"sending completed..."
;
}
void
GetIdByServer
(
framework
::
Scope
*
scope
,
...
...
@@ -99,11 +99,11 @@ class GenNCCLIdOp : public framework::OperatorBase {
std
::
bind
(
&
distributed
::
RPCServer
::
StartServer
,
rpc_service
.
get
()));
rpc_service
->
SetCond
(
distributed
::
kRequestSend
);
VLOG
(
3
)
<<
"start getting nccl id from trainer 0..."
;
VLOG
(
3
0
)
<<
"start getting nccl id from trainer 0..."
;
rpc_service
->
WaitBarrier
(
distributed
::
kRequestSend
);
VLOG
(
3
)
<<
"got nccl id and stop server..."
;
VLOG
(
3
0
)
<<
"got nccl id and stop server..."
;
rpc_service
->
ShutDown
();
VLOG
(
3
)
<<
"rpc server stopped"
;
VLOG
(
3
0
)
<<
"rpc server stopped"
;
server_thread
.
join
();
}
};
...
...
paddle/fluid/operators/grid_sampler_op.h
浏览文件 @
d231e550
...
...
@@ -63,12 +63,19 @@ static void CalcGridLocations(const platform::CPUDeviceContext& ctx,
Tensor
ones
;
ones
.
mutable_data
<
T
>
({
n
,
h
,
w
},
ctx
.
GetPlace
());
auto
ones_t
=
EigenTensor
<
T
,
3
>::
From
(
ones
).
setConstant
(
1.0
);
Tensor
half_xmax
,
half_ymax
;
half_xmax
.
mutable_data
<
T
>
({
n
,
h
,
w
},
ctx
.
GetPlace
());
auto
half_xmax_t
=
EigenTensor
<
T
,
3
>::
From
(
half_xmax
).
setConstant
(
0.5
*
x_max
);
half_ymax
.
mutable_data
<
T
>
({
n
,
h
,
w
},
ctx
.
GetPlace
());
auto
half_ymax_t
=
EigenTensor
<
T
,
3
>::
From
(
half_ymax
).
setConstant
(
0.5
*
y_max
);
// scale grid to [0, h-1/w-1]
auto
grid_x_t
=
EigenTensor
<
T
,
3
>::
From
(
grid_x
);
auto
grid_y_t
=
EigenTensor
<
T
,
3
>::
From
(
grid_y
);
grid_x_t
.
device
(
place
)
=
0.5
*
((
grid_x_t
+
ones_t
)
*
x_max
)
;
grid_y_t
.
device
(
place
)
=
0.5
*
((
grid_y_t
+
ones_t
)
*
y_max
)
;
grid_x_t
.
device
(
place
)
=
(
grid_x_t
+
ones_t
)
*
half_xmax_t
;
grid_y_t
.
device
(
place
)
=
(
grid_y_t
+
ones_t
)
*
half_ymax_t
;
// calculate coords of 4 corner points
x_w
->
mutable_data
<
T
>
({
n
,
h
,
w
},
ctx
.
GetPlace
());
...
...
paddle/fluid/operators/
bilinear_interp
_op.cc
→
paddle/fluid/operators/
interpolate
_op.cc
浏览文件 @
d231e550
/* Copyright (c) 201
6
PaddlePaddle Authors. All Rights Reserve.
/* Copyright (c) 201
8
PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
...
...
@@ -9,7 +9,8 @@
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/bilinear_interp_op.h"
#include "paddle/fluid/operators/interpolate_op.h"
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
...
...
@@ -18,27 +19,34 @@ namespace operators {
using
framework
::
Tensor
;
class
BilinearInterp
Op
:
public
framework
::
OperatorWithKernel
{
class
Interpolate
Op
:
public
framework
::
OperatorWithKernel
{
public:
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
protected:
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"X"
),
"Input(X) of
BilinearInter
Op should not be null."
);
"Input(X) of
Interpolate
Op should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Out"
),
"Output(Out) of BilinearInterOp should not be null."
);
"Output(Out) of InterpolationOp should not be null."
);
auto
interp_method
=
ctx
->
Attrs
().
Get
<
std
::
string
>
(
"interp_method"
);
PADDLE_ENFORCE
(
"bilinear"
==
interp_method
||
"nearest"
==
interp_method
,
"Interpolation method can only be
\"
bilinear
\"
or
\"
nearest
\"
."
);
auto
dim_x
=
ctx
->
GetInputDim
(
"X"
);
// NCHW format
int
out_h
=
ctx
->
Attrs
().
Get
<
int
>
(
"out_h"
);
int
out_w
=
ctx
->
Attrs
().
Get
<
int
>
(
"out_w"
);
PADDLE_ENFORCE_EQ
(
dim_x
.
size
(),
4
,
"X's dimension must be 4"
);
if
(
ctx
->
HasInput
(
"OutSize"
))
{
if
(
ctx
->
HasInput
(
"OutSize"
)
&&
ctx
->
IsRuntime
()
)
{
auto
out_size_dim
=
ctx
->
GetInputDim
(
"OutSize"
);
PADDLE_ENFORCE_EQ
(
out_size_dim
.
size
(),
1
,
"OutSize's dimension size must be 1"
);
PADDLE_ENFORCE_EQ
(
out_size_dim
[
0
],
2
,
"OutSize's dim[0] must be 2"
);
ctx
->
ShareLoD
(
"X"
,
"Out"
);
return
;
}
std
::
vector
<
int64_t
>
dim_out
({
dim_x
[
0
],
dim_x
[
1
],
out_h
,
out_w
});
ctx
->
SetOutputDim
(
"Out"
,
framework
::
make_ddim
(
dim_out
));
...
...
@@ -52,35 +60,53 @@ class BilinearInterpOp : public framework::OperatorWithKernel {
}
};
class
BilinearInterp
OpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
class
Interpolate
OpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
public:
void
Make
()
override
{
AddInput
(
"X"
,
"The input tensor of
bilinear interpolation
, "
"This is a 4-D tensor with shape of
(N x C x h x w)
"
);
"The input tensor of
interpolate operator
, "
"This is a 4-D tensor with shape of
[N, C, H, w].
"
);
AddInput
(
"OutSize"
,
"This is a 1-D tensor with two number. "
"This is a 1-D tensor with two number
s to specify output size
. "
"The first number is height and the second number is width."
)
.
AsDispensable
();
AddOutput
(
"Out"
,
"The dimension of output is (N x C x out_h x out_w)"
);
AddOutput
(
"Out"
,
"The output tensor of interpolate operator, "
"This is a 4-D tensor with shape of [N, C, H, W]."
);
AddAttr
<
int
>
(
"out_h"
,
"output height of bilinear interpolation op."
);
AddAttr
<
int
>
(
"out_w"
,
"output width of bilinear interpolation op."
);
AddAttr
<
int
>
(
"out_h"
,
"output height of interpolate op."
);
AddAttr
<
int
>
(
"out_w"
,
"output width of interpolate op."
);
AddAttr
<
std
::
string
>
(
"interp_method"
,
"(string), interpolation method, can be
\"
bilinear
\"
for "
"bilinear interpolation and
\"
nearest
\"
for nearest "
"neighbor interpolation."
);
AddComment
(
R"DOC(
This operator samples input X to given output shape by using specified
interpolation method, the interpolation methods can be \"nearest\"
for nearest neighbor interpolation and \"bilinear\" for bilinear
interpolation.
Nearest neighbor interpolation is to perform nearest neighbor interpolation
in both the 3rd dimention(in height direction) and the 4th dimention(in width
direction) on input tensor.
Bilinear interpolation is an extension of linear interpolation for
interpolating functions of two variables (e.g. H-direction and
W-direction in this op) on a rectilinear 2D grid.
The key idea is to perform linear interpolation first in one
direction, and then again in the other direction.
For details, please refer to Wikipedia:
W-direction in this op) on a rectilinear 2D grid. The key idea is
to perform linear interpolation first in one direction, and then
again in the other direction.
For details of nearest neighbor interpolation, please refer to Wikipedia:
https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation
For details of bilinear interpolation, please refer to Wikipedia:
https://en.wikipedia.org/wiki/Bilinear_interpolation
)DOC"
);
}
};
class
BilinearInterp
OpGrad
:
public
framework
::
OperatorWithKernel
{
class
Interpolate
OpGrad
:
public
framework
::
OperatorWithKernel
{
public:
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
...
...
@@ -106,11 +132,11 @@ class BilinearInterpOpGrad : public framework::OperatorWithKernel {
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
REGISTER_OPERATOR
(
bilinear_interp
,
ops
::
BilinearInterpOp
,
ops
::
BilinearInterpOpMaker
,
REGISTER_OPERATOR
(
interpolate
,
ops
::
InterpolateOp
,
ops
::
InterpolateOpMaker
,
paddle
::
framework
::
DefaultGradOpDescMaker
<
true
>
);
REGISTER_OPERATOR
(
bilinear_interp_grad
,
ops
::
BilinearInterpOpGrad
);
REGISTER_OP_CPU_KERNEL
(
bilinear_interp
,
ops
::
BilinearInterpKernel
<
float
>
,
ops
::
BilinearInterpKernel
<
uint8_t
>
);
REGISTER_OP_CPU_KERNEL
(
bilinear_interp_grad
,
ops
::
BilinearInterpGradKernel
<
float
>
);
REGISTER_OPERATOR
(
interpolate_grad
,
ops
::
InterpolateOpGrad
);
REGISTER_OP_CPU_KERNEL
(
interpolate
,
ops
::
InterpolateKernel
<
float
>
,
ops
::
InterpolateKernel
<
double
>
,
ops
::
InterpolateKernel
<
uint8_t
>
);
REGISTER_OP_CPU_KERNEL
(
interpolate_grad
,
ops
::
InterpolateGradKernel
<
float
>
,
ops
::
InterpolateGradKernel
<
double
>
);
paddle/fluid/operators/
bilinear_interp
_op.cu
→
paddle/fluid/operators/
interpolate
_op.cu
浏览文件 @
d231e550
/* Copyright (c) 201
6
PaddlePaddle Authors. All Rights Reserve.
/* Copyright (c) 201
8
PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
...
...
@@ -9,7 +9,8 @@
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/bilinear_interp_op.h"
#include <string>
#include "paddle/fluid/operators/interpolate_op.h"
#include "paddle/fluid/platform/cuda_primitives.h"
namespace
paddle
{
...
...
@@ -17,15 +18,72 @@ namespace operators {
using
framework
::
Tensor
;
template
<
typename
T
>
__global__
void
KeNearestNeighborInterpFw
(
const
T
*
in
,
const
size_t
in_img_h
,
const
size_t
in_img_w
,
const
size_t
input_h
,
const
size_t
input_w
,
T
*
out
,
const
size_t
out_img_h
,
const
size_t
out_img_w
,
const
size_t
output_h
,
const
size_t
output_w
,
const
size_t
num_channels
,
const
float
ratio_h
,
const
float
ratio_w
)
{
int
nthreads
=
output_h
*
output_w
;
int
tid
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
stride
=
blockDim
.
x
*
gridDim
.
x
;
for
(;
tid
<
nthreads
;
tid
+=
stride
)
{
int
out_id_h
=
tid
/
output_w
;
int
out_id_w
=
tid
%
output_w
;
int
in_img_size
=
input_w
/
num_channels
;
int
out_img_size
=
output_w
/
num_channels
;
int
channel_id
=
out_id_w
/
out_img_size
;
int
out_img_idy
=
(
out_id_w
%
out_img_size
)
/
out_img_w
;
int
in_img_idy
=
static_cast
<
int
>
(
ratio_h
*
out_img_idy
+
0.5
);
int
out_img_idx
=
tid
%
out_img_w
;
int
in_img_idx
=
static_cast
<
int
>
(
ratio_w
*
out_img_idx
+
0.5
);
out
[
tid
]
=
in
[
out_id_h
*
input_w
+
channel_id
*
in_img_size
+
in_img_idy
*
in_img_w
+
in_img_idx
];
}
}
template
<
typename
T
>
__global__
void
KeNearestNeighborInterpBw
(
T
*
in
,
const
size_t
in_img_h
,
const
size_t
in_img_w
,
const
size_t
input_h
,
const
size_t
input_w
,
const
T
*
out
,
const
size_t
out_img_h
,
const
size_t
out_img_w
,
const
size_t
output_h
,
const
size_t
output_w
,
const
size_t
num_channels
,
const
float
ratio_h
,
const
float
ratio_w
)
{
int
nthreads
=
output_h
*
output_w
;
int
tid
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
stride
=
blockDim
.
x
*
gridDim
.
x
;
for
(;
tid
<
nthreads
;
tid
+=
stride
)
{
int
out_id_h
=
tid
/
output_w
;
int
out_id_w
=
tid
%
output_w
;
int
in_img_size
=
input_w
/
num_channels
;
int
out_img_size
=
output_w
/
num_channels
;
int
channel_id
=
out_id_w
/
out_img_size
;
int
out_img_idy
=
(
out_id_w
%
out_img_size
)
/
out_img_w
;
int
in_img_idy
=
static_cast
<
int
>
(
ratio_h
*
out_img_idy
+
0.5
);
int
out_img_idx
=
tid
%
out_img_w
;
int
in_img_idx
=
static_cast
<
int
>
(
ratio_w
*
out_img_idx
+
0.5
);
T
*
in_pos
=
&
in
[
out_id_h
*
input_w
+
channel_id
*
in_img_size
+
in_img_idy
*
in_img_w
+
in_img_idx
];
const
T
out_pos
=
out
[
out_id_h
*
output_w
+
out_id_w
];
platform
::
CudaAtomicAdd
(
in_pos
,
out_pos
);
}
}
template
<
typename
T
>
__global__
void
KeBilinearInterpFw
(
const
T
*
in
,
const
size_t
in_img_h
,
const
size_t
in_img_w
,
const
size_t
input_h
,
const
size_t
input_w
,
T
*
out
,
const
size_t
out_img_h
,
const
size_t
out_img_w
,
const
size_t
output_h
,
const
size_t
output_w
,
const
size_t
num_channels
,
const
T
ratio_h
,
const
T
ratioW
)
{
const
size_t
num_channels
,
const
float
ratio_h
,
const
float
ratio_w
)
{
int
nthreads
=
output_h
*
output_w
;
int
tid
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
if
(
tid
<
nthreads
)
{
int
stride
=
blockDim
.
x
*
gridDim
.
x
;
for
(;
tid
<
nthreads
;
tid
+=
stride
)
{
int
out_id_h
=
tid
/
output_w
;
int
out_id_w
=
tid
%
output_w
;
int
in_img_size
=
input_w
/
num_channels
;
...
...
@@ -39,9 +97,9 @@ __global__ void KeBilinearInterpFw(
T
h2lambda
=
1.
f
-
h1lambda
;
int
out_img_idx
=
tid
%
out_img_w
;
int
in_img_idx
=
ratio
W
*
out_img_idx
;
int
in_img_idx
=
ratio
_w
*
out_img_idx
;
int
w_id
=
(
in_img_idx
<
in_img_w
-
1
)
?
1
:
0
;
T
w1lambda
=
ratio
W
*
out_img_idx
-
in_img_idx
;
T
w1lambda
=
ratio
_w
*
out_img_idx
-
in_img_idx
;
T
w2lambda
=
1.
f
-
w1lambda
;
const
T
*
in_pos
=
&
in
[
out_id_h
*
input_w
+
channel_id
*
in_img_size
+
...
...
@@ -60,10 +118,11 @@ __global__ void KeBilinearInterpBw(
T
*
in
,
const
size_t
in_img_h
,
const
size_t
in_img_w
,
const
size_t
input_h
,
const
size_t
input_w
,
const
T
*
out
,
const
size_t
out_img_h
,
const
size_t
out_img_w
,
const
size_t
output_h
,
const
size_t
output_w
,
const
size_t
num_channels
,
const
T
ratio_h
,
const
T
ratio
W
)
{
const
size_t
num_channels
,
const
T
ratio_h
,
const
T
ratio
_w
)
{
int
nthreads
=
output_h
*
output_w
;
int
tid
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
if
(
tid
<
nthreads
)
{
int
stride
=
blockDim
.
x
*
gridDim
.
x
;
for
(;
tid
<
nthreads
;
tid
+=
stride
)
{
int
out_id_h
=
tid
/
output_w
;
int
out_id_w
=
tid
%
output_w
;
int
in_img_size
=
input_w
/
num_channels
;
...
...
@@ -77,122 +136,146 @@ __global__ void KeBilinearInterpBw(
T
h2lambda
=
1.
f
-
h1lambda
;
int
out_img_idx
=
tid
%
out_img_w
;
int
in_img_idx
=
ratio
W
*
out_img_idx
;
int
in_img_idx
=
ratio
_w
*
out_img_idx
;
int
w_id
=
(
in_img_idx
<
in_img_w
-
1
)
?
1
:
0
;
T
w1lambda
=
ratio
W
*
out_img_idx
-
in_img_idx
;
T
w1lambda
=
ratio
_w
*
out_img_idx
-
in_img_idx
;
T
w2lambda
=
1.
f
-
w1lambda
;
T
*
in_pos
=
&
in
[
out_id_h
*
input_w
+
channel_id
*
in_img_size
+
in_img_idy
*
in_img_w
+
in_img_idx
];
const
T
*
out_pos
=
&
out
[
out_id_h
*
output_w
+
out_id_w
];
atomicAdd
(
&
in_pos
[
0
],
h2lambda
*
w2lambda
*
out_pos
[
0
]);
atomicAdd
(
&
in_pos
[
w_id
],
h2lambda
*
w1lambda
*
out_pos
[
0
]);
atomicAdd
(
&
in_pos
[
h_id
*
in_img_w
],
h1lambda
*
w2lambda
*
out_pos
[
0
]);
atomicAdd
(
&
in_pos
[
h_id
*
in_img_w
+
w_id
],
h1lambda
*
w1lambda
*
out_pos
[
0
]);
platform
::
CudaAtomicAdd
(
&
in_pos
[
0
],
h2lambda
*
w2lambda
*
out_pos
[
0
]);
platform
::
CudaAtomicAdd
(
&
in_pos
[
w_id
],
h2lambda
*
w1lambda
*
out_pos
[
0
]);
platform
::
CudaAtomicAdd
(
&
in_pos
[
h_id
*
in_img_w
],
h1lambda
*
w2lambda
*
out_pos
[
0
]);
platform
::
CudaAtomicAdd
(
&
in_pos
[
h_id
*
in_img_w
+
w_id
],
h1lambda
*
w1lambda
*
out_pos
[
0
]);
}
}
template
<
typename
T
>
class
BilinearInterp
OpCUDAKernel
:
public
framework
::
OpKernel
<
T
>
{
class
Interpolate
OpCUDAKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
PADDLE_ENFORCE
(
platform
::
is_gpu_place
(
ctx
.
GetPlace
()),
"This kernel only runs on GPU device."
);
auto
*
input
_t
=
ctx
.
Input
<
Tensor
>
(
"X"
);
// float tensor
auto
*
output
_t
=
ctx
.
Output
<
Tensor
>
(
"Out"
);
// float tensor
auto
*
input
=
input_
t
->
data
<
T
>
();
auto
*
input
=
ctx
.
Input
<
Tensor
>
(
"X"
);
auto
*
output
=
ctx
.
Output
<
Tensor
>
(
"Out"
);
auto
*
input
_data
=
inpu
t
->
data
<
T
>
();
auto
interp_method
=
ctx
.
Attr
<
std
::
string
>
(
"interp_method"
);
int
out_h
=
ctx
.
Attr
<
int
>
(
"out_h"
);
int
out_w
=
ctx
.
Attr
<
int
>
(
"out_w"
);
auto
out_dims
=
output_t
->
dims
();
auto
out_size_t
=
ctx
.
Input
<
Tensor
>
(
"OutSize"
);
if
(
out_size_t
!=
nullptr
)
{
auto
out_size
=
ctx
.
Input
<
Tensor
>
(
"OutSize"
);
if
(
out_size
!=
nullptr
)
{
Tensor
sizes
;
framework
::
TensorCopy
(
*
out_size
_t
,
platform
::
CPUPlace
(),
&
sizes
);
framework
::
TensorCopy
(
*
out_size
,
platform
::
CPUPlace
(),
&
sizes
);
auto
size_data
=
sizes
.
data
<
int
>
();
out_h
=
size_data
[
0
];
out_w
=
size_data
[
1
];
}
auto
*
output
=
output_t
->
mutable_data
<
T
>
(
{
out_dims
[
0
],
out_dims
[
1
],
out_h
,
out_w
},
ctx
.
GetPlace
());
int
batch_size
=
input_t
->
dims
()[
0
];
int
channels
=
input_t
->
dims
()[
1
];
int
in_h
=
input_t
->
dims
()[
2
];
int
in_w
=
input_t
->
dims
()[
3
];
int
n
=
input
->
dims
()[
0
];
int
c
=
input
->
dims
()[
1
];
int
in_h
=
input
->
dims
()[
2
];
int
in_w
=
input
->
dims
()[
3
];
auto
*
output_data
=
output
->
mutable_data
<
T
>
({
n
,
c
,
out_h
,
out_w
},
ctx
.
GetPlace
());
int
in_hw
=
in_h
*
in_w
;
int
out_hw
=
out_h
*
out_w
;
int
in_chw
=
c
hannels
*
in_hw
;
int
out_chw
=
c
hannels
*
out_hw
;
int
in_chw
=
c
*
in_hw
;
int
out_chw
=
c
*
out_hw
;
T
ratio_h
=
(
out_h
>
1
)
?
static_cast
<
T
>
(
in_h
-
1
)
/
(
out_h
-
1
)
:
0.
f
;
T
ratio_w
=
(
out_w
>
1
)
?
static_cast
<
T
>
(
in_w
-
1
)
/
(
out_w
-
1
)
:
0.
f
;
float
ratio_h
=
(
out_h
>
1
)
?
static_cast
<
float
>
(
in_h
-
1
)
/
(
out_h
-
1
)
:
0.
f
;
float
ratio_w
=
(
out_w
>
1
)
?
static_cast
<
float
>
(
in_w
-
1
)
/
(
out_w
-
1
)
:
0.
f
;
if
(
in_h
==
out_h
&&
in_w
==
out_w
)
{
memcpy
(
output
,
input
,
input_t
->
numel
()
*
sizeof
(
T
));
}
else
{
int
threadNum
=
batch_size
*
out_chw
;
int
blocks
=
(
threadNum
+
1024
-
1
)
/
1024
;
framework
::
TensorCopy
(
*
input
,
ctx
.
GetPlace
(),
output
);
return
;
}
int
pixelNum
=
n
*
out_chw
;
int
grid_dim
=
(
pixelNum
+
512
-
1
)
/
512
;
grid_dim
=
grid_dim
>
8
?
8
:
grid_dim
;
if
(
"nearest"
==
interp_method
)
{
KeNearestNeighborInterpFw
<
T
><<<
grid_dim
,
512
,
0
,
ctx
.
cuda_device_context
().
stream
()
>>>
(
input_data
,
in_h
,
in_w
,
n
,
in_chw
,
output_data
,
out_h
,
out_w
,
n
,
out_chw
,
c
,
ratio_h
,
ratio_w
);
}
else
if
(
"bilinear"
==
interp_method
)
{
KeBilinearInterpFw
<
T
><<<
blocks
,
1024
,
0
,
ctx
.
cuda_device_context
().
stream
()
>>>
(
input
,
in_h
,
in_w
,
batch_size
,
in_chw
,
output
,
out_h
,
out_w
,
batch_size
,
out_chw
,
channels
,
ratio_h
,
ratio_w
);
T
><<<
grid_dim
,
512
,
0
,
ctx
.
cuda_device_context
().
stream
()
>>>
(
input
_data
,
in_h
,
in_w
,
n
,
in_chw
,
output_data
,
out_h
,
out_w
,
n
,
out_chw
,
c
,
ratio_h
,
ratio_w
);
}
}
};
template
<
typename
T
>
class
BilinearInterp
GradOpCUDAKernel
:
public
framework
::
OpKernel
<
T
>
{
class
Interpolate
GradOpCUDAKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
*
d_input_t
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"X"
));
auto
*
d_output_t
=
ctx
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"Out"
));
auto
*
d_output
=
d_output_t
->
data
<
T
>
();
auto
*
d_input
=
d_input_t
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
*
input_grad
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"X"
));
auto
*
output_grad
=
ctx
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"Out"
));
auto
*
output_grad_data
=
output_grad
->
data
<
T
>
();
auto
*
input_grad_data
=
input_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
&
device_ctx
=
ctx
.
template
device_context
<
platform
::
CUDADeviceContext
>();
math
::
SetConstant
<
platform
::
CUDADeviceContext
,
T
>
zero
;
zero
(
device_ctx
,
d_input_t
,
static_cast
<
T
>
(
0.0
));
zero
(
device_ctx
,
input_grad
,
static_cast
<
T
>
(
0.0
));
auto
interp_method
=
ctx
.
Attr
<
std
::
string
>
(
"interp_method"
);
int
out_h
=
ctx
.
Attr
<
int
>
(
"out_h"
);
int
out_w
=
ctx
.
Attr
<
int
>
(
"out_w"
);
auto
out_size_t
=
ctx
.
Input
<
Tensor
>
(
"OutSize"
);
if
(
out_size_t
!=
nullptr
)
{
auto
out_size
=
ctx
.
Input
<
Tensor
>
(
"OutSize"
);
if
(
out_size
!=
nullptr
)
{
Tensor
sizes
;
framework
::
TensorCopy
(
*
out_size
_t
,
platform
::
CPUPlace
(),
&
sizes
);
framework
::
TensorCopy
(
*
out_size
,
platform
::
CPUPlace
(),
&
sizes
);
auto
size_data
=
sizes
.
data
<
int
>
();
out_h
=
size_data
[
0
];
out_w
=
size_data
[
1
];
}
int
batch_size
=
d_input_t
->
dims
()[
0
];
int
c
hannels
=
d_input_t
->
dims
()[
1
];
int
in_h
=
d_input_t
->
dims
()[
2
];
int
in_w
=
d_input_t
->
dims
()[
3
];
int
n
=
input_grad
->
dims
()[
0
];
int
c
=
input_grad
->
dims
()[
1
];
int
in_h
=
input_grad
->
dims
()[
2
];
int
in_w
=
input_grad
->
dims
()[
3
];
int
in_hw
=
in_h
*
in_w
;
int
out_hw
=
out_h
*
out_w
;
int
in_chw
=
c
hannels
*
in_hw
;
int
out_chw
=
c
hannels
*
out_hw
;
int
in_chw
=
c
*
in_hw
;
int
out_chw
=
c
*
out_hw
;
T
ratio_h
=
(
out_h
>
1
)
?
static_cast
<
T
>
(
in_h
-
1
)
/
(
out_h
-
1
)
:
0.
f
;
T
ratio_w
=
(
out_w
>
1
)
?
static_cast
<
T
>
(
in_w
-
1
)
/
(
out_w
-
1
)
:
0.
f
;
float
ratio_h
=
(
out_h
>
1
)
?
static_cast
<
float
>
(
in_h
-
1
)
/
(
out_h
-
1
)
:
0.
f
;
float
ratio_w
=
(
out_w
>
1
)
?
static_cast
<
float
>
(
in_w
-
1
)
/
(
out_w
-
1
)
:
0.
f
;
if
(
in_h
==
out_h
&&
in_w
==
out_w
)
{
memcpy
(
d_input
,
d_output
,
d_input_t
->
numel
()
*
sizeof
(
T
));
}
else
{
int
threadNum
=
batch_size
*
out_chw
;
int
blocks
=
(
threadNum
+
1024
-
1
)
/
1024
;
framework
::
TensorCopy
(
*
output_grad
,
ctx
.
GetPlace
(),
input_grad
);
return
;
}
int
pixelNum
=
n
*
out_chw
;
int
grid_dim
=
(
pixelNum
+
512
-
1
)
/
512
;
grid_dim
=
grid_dim
>
8
?
8
:
grid_dim
;
if
(
"nearest"
==
interp_method
)
{
KeNearestNeighborInterpBw
<
T
><<<
grid_dim
,
512
,
0
,
ctx
.
cuda_device_context
().
stream
()
>>>
(
input_grad_data
,
in_h
,
in_w
,
n
,
in_chw
,
output_grad_data
,
out_h
,
out_w
,
n
,
out_chw
,
c
,
ratio_h
,
ratio_w
);
}
else
if
(
"bilinear"
==
interp_method
)
{
KeBilinearInterpBw
<
T
><<<
blocks
,
1024
,
0
,
ctx
.
cuda_device_context
().
stream
()
>>>
(
d_input
,
in_h
,
in_w
,
batch_size
,
in_chw
,
d_output
,
out_h
,
out_w
,
batch_size
,
out_chw
,
channels
,
ratio_h
,
ratio_w
);
T
><<<
grid_dim
,
512
,
0
,
ctx
.
cuda_device_context
().
stream
()
>>>
(
input_grad_data
,
in_h
,
in_w
,
n
,
in_chw
,
output_grad_data
,
out_h
,
out_w
,
n
,
out_chw
,
c
,
ratio_h
,
ratio_w
);
}
}
};
...
...
@@ -201,7 +284,9 @@ class BilinearInterpGradOpCUDAKernel : public framework::OpKernel<T> {
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
REGISTER_OP_CUDA_KERNEL
(
bilinear_interp
,
ops
::
BilinearInterpOpCUDAKernel
<
float
>
);
REGISTER_OP_CUDA_KERNEL
(
bilinear_interp_grad
,
ops
::
BilinearInterpGradOpCUDAKernel
<
float
>
);
REGISTER_OP_CUDA_KERNEL
(
interpolate
,
ops
::
InterpolateOpCUDAKernel
<
float
>
,
ops
::
InterpolateOpCUDAKernel
<
double
>
,
ops
::
InterpolateOpCUDAKernel
<
int
>
);
REGISTER_OP_CUDA_KERNEL
(
interpolate_grad
,
ops
::
InterpolateGradOpCUDAKernel
<
float
>
,
ops
::
InterpolateGradOpCUDAKernel
<
double
>
);
paddle/fluid/operators/interpolate_op.h
0 → 100644
浏览文件 @
d231e550
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace
paddle
{
namespace
operators
{
template
<
typename
T
,
size_t
D
,
int
MajorType
=
Eigen
::
RowMajor
,
typename
IndexType
=
Eigen
::
DenseIndex
>
using
EigenTensor
=
framework
::
EigenTensor
<
T
,
D
,
MajorType
,
IndexType
>
;
using
Tensor
=
framework
::
Tensor
;
template
<
typename
T
>
static
void
NearestNeighborInterpolate
(
const
Tensor
&
input
,
Tensor
*
output
,
const
float
ratio_h
,
const
float
ratio_w
,
const
int
n
,
const
int
c
,
const
int
out_h
,
const
int
out_w
)
{
auto
input_t
=
EigenTensor
<
T
,
4
>::
From
(
input
);
auto
output_t
=
EigenTensor
<
T
,
4
>::
From
(
*
output
);
for
(
int
k
=
0
;
k
<
out_h
;
k
++
)
{
// loop for images
int
in_k
=
static_cast
<
int
>
(
ratio_h
*
k
+
0.5
);
for
(
int
l
=
0
;
l
<
out_w
;
l
++
)
{
int
in_l
=
static_cast
<
int
>
(
ratio_w
*
l
+
0.5
);
for
(
int
i
=
0
;
i
<
n
;
i
++
)
{
// loop for batches
for
(
int
j
=
0
;
j
<
c
;
j
++
)
{
// loop for channels
output_t
(
i
,
j
,
k
,
l
)
=
input_t
(
i
,
j
,
in_k
,
in_l
);
}
}
}
}
}
template
<
typename
T
>
static
void
BilinearInterpolation
(
const
Tensor
&
input
,
Tensor
*
output
,
const
float
ratio_h
,
const
float
ratio_w
,
const
int
in_h
,
const
int
in_w
,
const
int
n
,
const
int
c
,
const
int
out_h
,
const
int
out_w
)
{
auto
input_t
=
EigenTensor
<
T
,
4
>::
From
(
input
);
auto
output_t
=
EigenTensor
<
T
,
4
>::
From
(
*
output
);
for
(
int
k
=
0
;
k
<
out_h
;
k
++
)
{
// loop for images
int
y_n
=
static_cast
<
int
>
(
ratio_h
*
k
);
int
y_s
=
(
y_n
+
1
)
<
(
in_h
-
1
)
?
(
y_n
+
1
)
:
(
in_h
-
1
);
float
d_n
=
ratio_h
*
k
-
y_n
;
float
d_s
=
1.
f
-
d_n
;
for
(
int
l
=
0
;
l
<
out_w
;
l
++
)
{
int
x_w
=
static_cast
<
int
>
(
ratio_w
*
l
);
int
x_e
=
(
x_w
+
1
)
<
(
in_w
-
1
)
?
(
x_w
+
1
)
:
(
in_w
-
1
);
float
d_w
=
ratio_w
*
l
-
x_w
;
float
d_e
=
1.
f
-
d_w
;
for
(
int
i
=
0
;
i
<
n
;
i
++
)
{
// loop for batches
for
(
int
j
=
0
;
j
<
c
;
j
++
)
{
// loop for channels
// bilinear interpolation
output_t
(
i
,
j
,
k
,
l
)
=
input_t
(
i
,
j
,
y_n
,
x_w
)
*
d_s
*
d_e
+
input_t
(
i
,
j
,
y_s
,
x_w
)
*
d_n
*
d_e
+
input_t
(
i
,
j
,
y_n
,
x_e
)
*
d_s
*
d_w
+
input_t
(
i
,
j
,
y_s
,
x_e
)
*
d_n
*
d_w
;
}
}
}
}
}
template
<
typename
T
>
static
void
NearestNeighborInterpolateGrad
(
const
Tensor
&
output_grad
,
Tensor
*
input_grad
,
const
float
ratio_h
,
const
float
ratio_w
,
const
int
n
,
const
int
c
,
const
int
out_h
,
const
int
out_w
)
{
auto
input_grad_t
=
EigenTensor
<
T
,
4
>::
From
(
*
input_grad
);
auto
output_grad_t
=
EigenTensor
<
T
,
4
>::
From
(
output_grad
);
for
(
int
k
=
0
;
k
<
out_h
;
k
++
)
{
// loop for images
int
in_k
=
static_cast
<
int
>
(
ratio_h
*
k
+
0.5
);
for
(
int
l
=
0
;
l
<
out_w
;
l
++
)
{
int
in_l
=
static_cast
<
int
>
(
ratio_w
*
l
+
0.5
);
for
(
int
i
=
0
;
i
<
n
;
i
++
)
{
// loop for batches
for
(
int
j
=
0
;
j
<
c
;
j
++
)
{
// loop for channels
input_grad_t
(
i
,
j
,
in_k
,
in_l
)
+=
output_grad_t
(
i
,
j
,
k
,
l
);
}
}
}
}
}
template
<
typename
T
>
static
void
BilinearInterpolationGrad
(
const
Tensor
&
output_grad
,
Tensor
*
input_grad
,
const
float
ratio_h
,
const
float
ratio_w
,
const
int
in_h
,
const
int
in_w
,
const
int
n
,
const
int
c
,
const
int
out_h
,
const
int
out_w
)
{
auto
input_grad_t
=
EigenTensor
<
T
,
4
>::
From
(
*
input_grad
);
auto
output_grad_t
=
EigenTensor
<
T
,
4
>::
From
(
output_grad
);
for
(
int
k
=
0
;
k
<
out_h
;
k
++
)
{
// loop for images
int
y_n
=
static_cast
<
int
>
(
ratio_h
*
k
);
int
y_s
=
(
y_n
+
1
)
<
(
in_h
-
1
)
?
(
y_n
+
1
)
:
(
in_h
-
1
);
float
d_n
=
ratio_h
*
k
-
y_n
;
float
d_s
=
1.
f
-
d_n
;
for
(
int
l
=
0
;
l
<
out_w
;
l
++
)
{
int
x_w
=
static_cast
<
int
>
(
ratio_w
*
l
);
int
x_e
=
(
x_w
+
1
)
<
(
in_w
-
1
)
?
(
x_w
+
1
)
:
(
in_w
-
1
);
float
d_w
=
ratio_w
*
l
-
x_w
;
float
d_e
=
1.
f
-
d_w
;
for
(
int
i
=
0
;
i
<
n
;
i
++
)
{
// loop for batches
for
(
int
j
=
0
;
j
<
c
;
j
++
)
{
// loop for channels
// bilinear interpolation grad
const
T
grad
=
output_grad_t
(
i
,
j
,
k
,
l
);
input_grad_t
(
i
,
j
,
y_n
,
x_w
)
+=
static_cast
<
T
>
(
grad
*
d_s
*
d_e
);
input_grad_t
(
i
,
j
,
y_s
,
x_w
)
+=
static_cast
<
T
>
(
grad
*
d_n
*
d_e
);
input_grad_t
(
i
,
j
,
y_n
,
x_e
)
+=
static_cast
<
T
>
(
grad
*
d_s
*
d_w
);
input_grad_t
(
i
,
j
,
y_s
,
x_e
)
+=
static_cast
<
T
>
(
grad
*
d_n
*
d_w
);
}
}
}
}
}
template
<
typename
T
>
class
InterpolateKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
*
input
=
ctx
.
Input
<
Tensor
>
(
"X"
);
auto
*
output
=
ctx
.
Output
<
Tensor
>
(
"Out"
);
std
::
string
interp_method
=
ctx
.
Attr
<
std
::
string
>
(
"interp_method"
);
int
out_h
=
ctx
.
Attr
<
int
>
(
"out_h"
);
int
out_w
=
ctx
.
Attr
<
int
>
(
"out_w"
);
auto
out_size
=
ctx
.
Input
<
Tensor
>
(
"OutSize"
);
if
(
out_size
!=
nullptr
)
{
auto
out_size_data
=
out_size
->
data
<
int
>
();
out_h
=
out_size_data
[
0
];
out_w
=
out_size_data
[
1
];
}
const
int
n
=
input
->
dims
()[
0
];
const
int
c
=
input
->
dims
()[
1
];
const
int
in_h
=
input
->
dims
()[
2
];
const
int
in_w
=
input
->
dims
()[
3
];
output
->
mutable_data
<
T
>
({
n
,
c
,
out_h
,
out_w
},
ctx
.
GetPlace
());
auto
&
device_ctx
=
ctx
.
template
device_context
<
platform
::
CPUDeviceContext
>();
math
::
SetConstant
<
platform
::
CPUDeviceContext
,
T
>
zero
;
zero
(
device_ctx
,
output
,
static_cast
<
T
>
(
0.0
));
if
(
in_h
==
out_h
&&
in_w
==
out_w
)
{
framework
::
TensorCopy
(
*
input
,
ctx
.
GetPlace
(),
output
);
return
;
}
float
ratio_h
=
(
out_h
>
1
)
?
static_cast
<
float
>
(
in_h
-
1
)
/
(
out_h
-
1
)
:
0.
f
;
float
ratio_w
=
(
out_w
>
1
)
?
static_cast
<
float
>
(
in_w
-
1
)
/
(
out_w
-
1
)
:
0.
f
;
if
(
"bilinear"
==
interp_method
)
{
BilinearInterpolation
<
T
>
(
*
input
,
output
,
ratio_h
,
ratio_w
,
in_h
,
in_w
,
n
,
c
,
out_h
,
out_w
);
}
else
if
(
"nearest"
==
interp_method
)
{
NearestNeighborInterpolate
<
T
>
(
*
input
,
output
,
ratio_h
,
ratio_w
,
n
,
c
,
out_h
,
out_w
);
}
}
};
template
<
typename
T
>
class
InterpolateGradKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
*
input
=
ctx
.
Input
<
Tensor
>
(
"X"
);
auto
*
input_grad
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"X"
));
auto
*
output_grad
=
ctx
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"Out"
));
std
::
string
interp_method
=
ctx
.
Attr
<
std
::
string
>
(
"interp_method"
);
int
out_h
=
ctx
.
Attr
<
int
>
(
"out_h"
);
int
out_w
=
ctx
.
Attr
<
int
>
(
"out_w"
);
auto
out_size
=
ctx
.
Input
<
Tensor
>
(
"OutSize"
);
if
(
out_size
!=
nullptr
)
{
auto
out_size_data
=
out_size
->
data
<
int
>
();
out_h
=
out_size_data
[
0
];
out_w
=
out_size_data
[
1
];
}
const
int
n
=
input
->
dims
()[
0
];
const
int
c
=
input
->
dims
()[
1
];
const
int
in_h
=
input
->
dims
()[
2
];
const
int
in_w
=
input
->
dims
()[
3
];
input_grad
->
mutable_data
<
T
>
({
n
,
c
,
in_h
,
in_w
},
ctx
.
GetPlace
());
auto
&
device_ctx
=
ctx
.
template
device_context
<
platform
::
CPUDeviceContext
>();
math
::
SetConstant
<
platform
::
CPUDeviceContext
,
T
>
zero
;
zero
(
device_ctx
,
input_grad
,
static_cast
<
T
>
(
0.0
));
if
(
in_h
==
out_h
&&
in_w
==
out_w
)
{
framework
::
TensorCopy
(
*
output_grad
,
ctx
.
GetPlace
(),
input_grad
);
return
;
}
float
ratio_h
=
(
out_h
>
1
)
?
static_cast
<
float
>
(
in_h
-
1
)
/
(
out_h
-
1
)
:
0.
f
;
float
ratio_w
=
(
out_w
>
1
)
?
static_cast
<
float
>
(
in_w
-
1
)
/
(
out_w
-
1
)
:
0.
f
;
if
(
"bilinear"
==
interp_method
)
{
BilinearInterpolationGrad
<
T
>
(
*
output_grad
,
input_grad
,
ratio_h
,
ratio_w
,
in_h
,
in_w
,
n
,
c
,
out_h
,
out_w
);
}
else
if
(
"nearest"
==
interp_method
)
{
NearestNeighborInterpolateGrad
<
T
>
(
*
output_grad
,
input_grad
,
ratio_h
,
ratio_w
,
n
,
c
,
out_h
,
out_w
);
}
}
};
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/listen_and_serv_op.cc
浏览文件 @
d231e550
...
...
@@ -36,7 +36,7 @@ namespace operators {
void
RunServer
(
std
::
shared_ptr
<
distributed
::
RPCServer
>
service
)
{
service
->
StartServer
();
VLOG
(
4
)
<<
"RunServer thread end"
;
VLOG
(
4
0
)
<<
"RunServer thread end"
;
}
static
void
split
(
const
std
::
string
&
str
,
char
sep
,
std
::
vector
<
std
::
string
>
*
pieces
)
{
...
...
@@ -66,8 +66,8 @@ static void ParallelExecuteBlocks(
fs
.
push_back
(
framework
::
Async
([
&
executor
,
&
prepared
,
&
scope
,
idx
]()
{
int
run_block
=
idx
;
// thread local
try
{
VLOG
(
3
)
<<
"running server block: "
<<
run_block
<<
"pointer: "
<<
prepared
[
run_block
].
get
();
VLOG
(
3
0
)
<<
"running server block: "
<<
run_block
<<
"pointer: "
<<
prepared
[
run_block
].
get
();
executor
->
RunPreparedContext
(
prepared
[
run_block
].
get
(),
scope
);
}
catch
(
const
std
::
exception
&
e
)
{
LOG
(
FATAL
)
<<
"run sub program:"
<<
idx
<<
" error "
<<
e
.
what
();
...
...
@@ -108,7 +108,7 @@ void ListenAndServOp::RunSyncLoop(
framework
::
Scope
*
recv_scope
,
platform
::
DeviceContext
*
dev_ctx
,
const
std
::
vector
<
int
>
&
prefetch_block_id_list
,
const
int
checkpoint_point_block_id
)
const
{
VLOG
(
2
)
<<
"RunSyncLoop"
;
VLOG
(
2
0
)
<<
"RunSyncLoop"
;
size_t
num_blocks
=
program
->
Size
();
auto
optimize_blocks
=
Attr
<
std
::
vector
<
framework
::
BlockDesc
*>>
(
kOptimizeBlocks
);
...
...
@@ -167,7 +167,7 @@ void ListenAndServOp::RunSyncLoop(
}
ParallelExecuteBlocks
(
parallel_blkids
,
executor
,
optimize_prepared
,
program
,
recv_scope
);
VLOG
(
2
)
<<
"run all blocks spent "
<<
GetTimestamp
()
-
ts
<<
"(ms)"
;
VLOG
(
2
0
)
<<
"run all blocks spent "
<<
GetTimestamp
()
-
ts
<<
"(ms)"
;
ResetReceivedVars
(
recv_scope
,
dev_ctx
,
rpc_service_
->
NeedResetAllVars
());
...
...
@@ -183,11 +183,11 @@ void ListenAndServOp::ResetReceivedVars(framework::Scope *recv_scope,
for
(
auto
&
varname
:
sparse_vars_
)
{
auto
var
=
recv_scope
->
FindVar
(
varname
);
if
(
var
==
nullptr
)
{
VLOG
(
2
)
<<
"can not find var "
<<
varname
<<
" in received scope"
;
VLOG
(
2
0
)
<<
"can not find var "
<<
varname
<<
" in received scope"
;
continue
;
}
if
(
var
->
IsType
<
framework
::
SelectedRows
>
())
{
VLOG
(
3
)
<<
"reset sparse var: "
<<
varname
;
VLOG
(
3
0
)
<<
"reset sparse var: "
<<
varname
;
var
->
GetMutable
<
framework
::
SelectedRows
>
()
->
mutable_rows
()
->
clear
();
}
else
{
PADDLE_THROW
(
"The type of sparse var should be SelectedRows"
);
...
...
@@ -197,7 +197,7 @@ void ListenAndServOp::ResetReceivedVars(framework::Scope *recv_scope,
for
(
auto
&
varname
:
dense_vars_
)
{
auto
var
=
recv_scope
->
FindVar
(
varname
);
if
(
var
==
nullptr
)
{
VLOG
(
2
)
<<
"can not find var "
<<
varname
<<
" in received scope"
;
VLOG
(
2
0
)
<<
"can not find var "
<<
varname
<<
" in received scope"
;
continue
;
}
if
(
var
->
IsType
<
framework
::
LoDTensor
>
())
{
...
...
@@ -216,7 +216,7 @@ void ListenAndServOp::ResetReceivedVars(framework::Scope *recv_scope,
void
ListenAndServOp
::
RunAsyncLoop
(
framework
::
Executor
*
executor
,
framework
::
ProgramDesc
*
program
,
framework
::
Scope
*
recv_scope
)
const
{
VLOG
(
2
)
<<
"RunAsyncLoop"
;
VLOG
(
2
0
)
<<
"RunAsyncLoop"
;
auto
grad_to_block_id_str
=
Attr
<
std
::
vector
<
std
::
string
>>
(
"grad_to_block_id"
);
DoubleFindMap
<
std
::
string
,
int32_t
>
grad_to_block_id
;
...
...
@@ -225,7 +225,7 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
const
std
::
string
&
grad_and_id
)
{
std
::
vector
<
std
::
string
>
pieces
;
split
(
grad_and_id
,
':'
,
&
pieces
);
VLOG
(
3
)
<<
"after split, key = "
<<
pieces
[
0
]
<<
", id="
<<
pieces
[
1
];
VLOG
(
3
0
)
<<
"after split, key = "
<<
pieces
[
0
]
<<
", id="
<<
pieces
[
1
];
PADDLE_ENFORCE_EQ
(
pieces
.
size
(),
2
);
PADDLE_ENFORCE_EQ
(
out_map
->
count
(
pieces
[
0
]),
0
);
...
...
@@ -270,7 +270,7 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
while
(
true
)
{
if
(
rpc_service_
->
IsExit
())
{
VLOG
(
4
)
<<
"get exit!rpc_processor break!"
;
VLOG
(
4
0
)
<<
"get exit!rpc_processor break!"
;
break
;
}
...
...
@@ -332,9 +332,9 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
std
::
string
endpoint
=
Attr
<
std
::
string
>
(
"endpoint"
);
int
checkpoint_block_id
=
Attr
<
int
>
(
kCheckpointBlockId
);
VLOG
(
4
)
<<
"sync_mode:"
<<
sync_mode
<<
", fan_in:"
<<
fan_in
<<
", end_point:"
<<
endpoint
<<
", checkpoint_block_id: "
<<
checkpoint_block_id
;
VLOG
(
4
0
)
<<
"sync_mode:"
<<
sync_mode
<<
", fan_in:"
<<
fan_in
<<
", end_point:"
<<
endpoint
<<
", checkpoint_block_id: "
<<
checkpoint_block_id
;
rpc_service_
.
reset
(
new
RPCSERVER_T
(
endpoint
,
fan_in
));
...
...
@@ -383,8 +383,8 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
prefetch_var_name_to_block_id_str
)
{
std
::
vector
<
std
::
string
>
pieces
;
split
(
prefetch_var_name_and_id
,
':'
,
&
pieces
);
VLOG
(
3
)
<<
"after split, prefetch_var = "
<<
pieces
[
0
]
<<
", id="
<<
pieces
[
1
];
VLOG
(
3
0
)
<<
"after split, prefetch_var = "
<<
pieces
[
0
]
<<
", id="
<<
pieces
[
1
];
PADDLE_ENFORCE_EQ
(
pieces
.
size
(),
2
);
int
block_id
=
std
::
stoi
(
pieces
[
1
]);
...
...
@@ -415,7 +415,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
// start the server listening after all member initialized.
server_thread_
.
reset
(
new
std
::
thread
(
RunServer
,
rpc_service_
));
VLOG
(
3
)
<<
"wait server thread to become ready..."
;
VLOG
(
3
0
)
<<
"wait server thread to become ready..."
;
rpc_service_
->
WaitServerReady
();
// register SIGINT(from ctrl+C) and SIGTERM(from kill) signal handlers
...
...
paddle/fluid/operators/lod_rank_table_op.cc
浏览文件 @
d231e550
...
...
@@ -30,9 +30,9 @@ class LoDRankTableOp : public framework::OperatorBase {
auto
x
=
scope
.
FindVar
(
Input
(
"X"
))
->
Get
<
framework
::
LoDTensor
>
();
auto
*
out
=
scope
.
FindVar
(
Output
(
"Out"
))
->
GetMutable
<
framework
::
LoDRankTable
>
();
VLOG
(
10
)
<<
"Level = "
<<
static_cast
<
size_t
>
(
Attr
<
int
>
(
"level"
));
VLOG
(
10
0
)
<<
"Level = "
<<
static_cast
<
size_t
>
(
Attr
<
int
>
(
"level"
));
out
->
Reset
(
x
.
lod
(),
static_cast
<
size_t
>
(
Attr
<
int
>
(
"level"
)));
VLOG
(
10
)
<<
Input
(
"X"
)
<<
"'s lod information is "
<<
*
out
;
VLOG
(
10
0
)
<<
Input
(
"X"
)
<<
"'s lod information is "
<<
*
out
;
}
};
...
...
paddle/fluid/operators/lookup_table_op.cc
浏览文件 @
d231e550
...
...
@@ -134,13 +134,13 @@ class LookupTableOpGradVarTypeInference : public framework::VarTypeInference {
auto
attr
=
op_desc
.
GetAttr
(
"is_sparse"
);
bool
is_sparse
=
boost
::
get
<
bool
>
(
attr
);
if
(
is_sparse
)
{
VLOG
(
3
)
<<
"lookup_table_grad op "
<<
framework
::
GradVarName
(
"W"
)
<<
" is set to SelectedRows"
;
VLOG
(
3
0
)
<<
"lookup_table_grad op "
<<
framework
::
GradVarName
(
"W"
)
<<
" is set to SelectedRows"
;
block
->
Var
(
out_var_name
)
->
SetType
(
framework
::
proto
::
VarType
::
SELECTED_ROWS
);
}
else
{
VLOG
(
3
)
<<
"lookup_table_grad op "
<<
framework
::
GradVarName
(
"W"
)
<<
" is set to LoDTensor"
;
VLOG
(
3
0
)
<<
"lookup_table_grad op "
<<
framework
::
GradVarName
(
"W"
)
<<
" is set to LoDTensor"
;
block
->
Var
(
out_var_name
)
->
SetType
(
framework
::
proto
::
VarType
::
LOD_TENSOR
);
}
block
->
Var
(
out_var_name
)
->
SetDataType
(
block
->
Var
(
"W"
)
->
GetDataType
());
...
...
paddle/fluid/operators/math/cpu_vec_test.cc
浏览文件 @
d231e550
...
...
@@ -96,8 +96,8 @@ void TestAndBench(const int n, std::function<void(const int, const T*, T*)> tgt,
}
auto
et
=
GetCurrentUS
();
VLOG
(
3
)
<<
"Vec size "
<<
n
<<
": refer takes: "
<<
(
et
-
mt
)
/
repeat
<<
" us, tgt takes: "
<<
(
mt
-
st
)
/
repeat
;
VLOG
(
3
0
)
<<
"Vec size "
<<
n
<<
": refer takes: "
<<
(
et
-
mt
)
/
repeat
<<
" us, tgt takes: "
<<
(
mt
-
st
)
/
repeat
;
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
EXPECT_NEAR
(
ytgt_data
[
i
],
yref_data
[
i
],
1e-3
);
}
...
...
paddle/fluid/operators/math/jit_kernel_test.cc
浏览文件 @
d231e550
...
...
@@ -87,7 +87,7 @@ TEST(JitKernel, vrelu) {
vrelu_intri8
(
d
,
x_data
,
zref_data
);
}
auto
si1
=
GetCurrentUS
();
VLOG
(
3
)
<<
"Vec size 8 intr takes: "
<<
(
si1
-
si0
)
/
repeat
;
VLOG
(
3
0
)
<<
"Vec size 8 intr takes: "
<<
(
si1
-
si0
)
/
repeat
;
}
#endif
auto
ttgts
=
GetCurrentUS
();
...
...
@@ -95,8 +95,9 @@ TEST(JitKernel, vrelu) {
ker
->
Compute
(
x_data
,
ztgt_data
);
}
auto
ttgte
=
GetCurrentUS
();
VLOG
(
3
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
<<
" us, tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
;
VLOG
(
30
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
<<
" us, tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
;
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
EXPECT_NEAR
(
ztgt_data
[
i
],
zref_data
[
i
],
1e-3
);
}
...
...
@@ -132,8 +133,9 @@ TEST(JitKernel, vaddbias) {
}
auto
ttgte
=
GetCurrentUS
();
VLOG
(
3
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
<<
" us, tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
;
VLOG
(
30
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
<<
" us, tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
;
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
EXPECT_NEAR
(
ztgt_data
[
i
],
zref_data
[
i
],
1e-3
);
}
...
...
@@ -183,13 +185,14 @@ TEST(JitKernel, vexp) {
}
auto
ttgte
=
GetCurrentUS
();
VLOG
(
3
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
VLOG
(
30
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
#ifdef PADDLE_WITH_MKLML
<<
" us, mkl takes: "
<<
(
tmkle
-
tmkls
)
/
repeat
<<
" us, "
<<
" us, mkl takes: "
<<
(
tmkle
-
tmkls
)
/
repeat
<<
" us, "
#else
<<
" us, "
<<
" us, "
#endif
<<
"tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
;
<<
"tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
;
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
EXPECT_NEAR
(
ztgt_data
[
i
],
zref_data
[
i
],
1e-3
);
}
...
...
@@ -254,9 +257,10 @@ TEST(JitKernel, vsigmoid) {
}
auto
ttgte
=
GetCurrentUS
();
VLOG
(
3
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
<<
" us, better(jit exp) takes: "
<<
(
tmkle
-
tmkls
)
/
repeat
<<
" us, tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
;
VLOG
(
30
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
<<
" us, better(jit exp) takes: "
<<
(
tmkle
-
tmkls
)
/
repeat
<<
" us, tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
;
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
EXPECT_NEAR
(
ztgt_data
[
i
],
zref_data
[
i
],
1e-3
);
}
...
...
@@ -321,9 +325,10 @@ TEST(JitKernel, vtanh) {
}
auto
ttgte
=
GetCurrentUS
();
VLOG
(
3
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
<<
" us, better(jit exp) takes: "
<<
(
tmkle
-
tmkls
)
/
repeat
<<
" us, tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
;
VLOG
(
30
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
<<
" us, better(jit exp) takes: "
<<
(
tmkle
-
tmkls
)
/
repeat
<<
" us, tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
;
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
EXPECT_NEAR
(
ztgt_data
[
i
],
zref_data
[
i
],
1e-3
);
}
...
...
@@ -441,9 +446,10 @@ TEST(JitKernel, lstm) {
ker
->
ComputeCtHt
(
x_data
,
ct_1_data
,
ct_tgt_data
,
ht_tgt_data
);
}
auto
ttgte
=
GetCurrentUS
();
VLOG
(
3
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
<<
" us, better(jit) takes: "
<<
(
tmkle
-
tmkls
)
/
repeat
<<
" us, tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
;
VLOG
(
30
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
<<
" us, better(jit) takes: "
<<
(
tmkle
-
tmkls
)
/
repeat
<<
" us, tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
;
}
}
...
...
@@ -525,8 +531,8 @@ TEST(JitKernel, vscal) {
vscal_inp_intri8
(
d
,
a
,
y_data
);
}
auto
si3
=
GetCurrentUS
();
VLOG
(
3
)
<<
"Vec size 8 intr takes: "
<<
(
si1
-
si0
)
/
repeat
<<
" us, inplace: "
<<
(
si3
-
si2
)
/
repeat
;
VLOG
(
3
0
)
<<
"Vec size 8 intr takes: "
<<
(
si1
-
si0
)
/
repeat
<<
" us, inplace: "
<<
(
si3
-
si2
)
/
repeat
;
}
#endif
...
...
@@ -540,15 +546,17 @@ TEST(JitKernel, vscal) {
ker
->
Compute
(
&
a
,
y_data
,
y_data
,
d
);
}
auto
ttgte1
=
GetCurrentUS
();
VLOG
(
3
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
<<
" us, inplace takes: "
<<
(
trefe1
-
trefs1
)
/
repeat
VLOG
(
30
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
<<
" us, inplace takes: "
<<
(
trefe1
-
trefs1
)
/
repeat
#ifdef PADDLE_WITH_MKLML
<<
" us, mkl inplace takes: "
<<
(
tmkle
-
tmkls
)
/
repeat
<<
" us, "
<<
" us, mkl inplace takes: "
<<
(
tmkle
-
tmkls
)
/
repeat
<<
" us, "
#else
<<
" us, "
<<
" us, "
#endif
<<
"tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
<<
"us, tgt inplace takes: "
<<
(
ttgte1
-
ttgts1
)
/
repeat
;
<<
"tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
<<
"us, tgt inplace takes: "
<<
(
ttgte1
-
ttgts1
)
/
repeat
;
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
EXPECT_NEAR
(
ztgt_data
[
i
],
zref_data
[
i
],
1e-3
);
}
...
...
@@ -611,7 +619,7 @@ TEST(JitKernel, vmul) {
vmul_intri8
(
d
,
x_data
,
y_data
,
zref_data
);
}
auto
si1
=
GetCurrentUS
();
VLOG
(
3
)
<<
"Vec size 8 intr takes: "
<<
(
si1
-
si0
)
/
repeat
;
VLOG
(
3
0
)
<<
"Vec size 8 intr takes: "
<<
(
si1
-
si0
)
/
repeat
;
}
#endif
...
...
@@ -621,13 +629,14 @@ TEST(JitKernel, vmul) {
}
auto
ttgte
=
GetCurrentUS
();
VLOG
(
3
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
VLOG
(
30
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
#ifdef PADDLE_WITH_MKLML
<<
" us, mkl takes: "
<<
(
tmkle
-
tmkls
)
/
repeat
<<
" us, "
<<
" us, mkl takes: "
<<
(
tmkle
-
tmkls
)
/
repeat
<<
" us, "
#else
<<
" us, "
<<
" us, "
#endif
<<
"tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
;
<<
"tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
;
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
EXPECT_NEAR
(
ztgt_data
[
i
],
zref_data
[
i
],
1e-3
);
}
...
...
@@ -690,7 +699,7 @@ TEST(JitKernel, vadd) {
vadd_intri8
(
d
,
x_data
,
y_data
,
zref_data
);
}
auto
si1
=
GetCurrentUS
();
VLOG
(
3
)
<<
"Vec size 8 intr takes: "
<<
(
si1
-
si0
)
/
repeat
;
VLOG
(
3
0
)
<<
"Vec size 8 intr takes: "
<<
(
si1
-
si0
)
/
repeat
;
}
#endif
...
...
@@ -700,13 +709,14 @@ TEST(JitKernel, vadd) {
}
auto
ttgte
=
GetCurrentUS
();
VLOG
(
3
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
VLOG
(
30
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
#ifdef PADDLE_WITH_MKLML
<<
" us, mkl takes: "
<<
(
tmkle
-
tmkls
)
/
repeat
<<
" us, "
<<
" us, mkl takes: "
<<
(
tmkle
-
tmkls
)
/
repeat
<<
" us, "
#else
<<
" us, "
<<
" us, "
#endif
<<
"tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
;
<<
"tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
;
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
EXPECT_NEAR
(
ztgt_data
[
i
],
zref_data
[
i
],
1e-3
);
}
...
...
@@ -761,9 +771,10 @@ TEST(JitKernel, vaddrelu) {
ker
->
Compute
(
x_data
,
y_data
,
ztgt_data
,
d
);
}
auto
ttgte
=
GetCurrentUS
();
VLOG
(
3
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
<<
" us, better takes: "
<<
(
tmkle
-
tmkls
)
/
repeat
<<
" us, "
<<
"tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
;
VLOG
(
30
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
<<
" us, better takes: "
<<
(
tmkle
-
tmkls
)
/
repeat
<<
" us, "
<<
"tgt takes: "
<<
(
ttgte
-
ttgts
)
/
repeat
;
for
(
int
i
=
0
;
i
<
d
;
++
i
)
{
EXPECT_NEAR
(
ztgt_data
[
i
],
zref_data
[
i
],
1e-3
);
}
...
...
paddle/fluid/operators/math/selected_rows_functor.cc
浏览文件 @
d231e550
...
...
@@ -270,7 +270,7 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
const
std
::
vector
<
const
framework
::
SelectedRows
*>&
inputs
,
framework
::
SelectedRows
*
output
)
{
if
(
inputs
.
size
()
==
0
)
{
VLOG
(
3
)
<<
"no input! return"
;
VLOG
(
3
0
)
<<
"no input! return"
;
return
;
}
const
framework
::
SelectedRows
*
has_value_input
=
nullptr
;
...
...
@@ -281,7 +281,7 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
}
}
if
(
has_value_input
==
nullptr
)
{
VLOG
(
3
)
<<
"no input has value! just return"
<<
std
::
endl
;
VLOG
(
3
0
)
<<
"no input has value! just return"
<<
std
::
endl
;
return
;
}
auto
input_width
=
has_value_input
->
value
().
dims
()[
1
];
...
...
paddle/fluid/operators/math/selected_rows_functor.cu
浏览文件 @
d231e550
...
...
@@ -314,7 +314,7 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
const
std
::
vector
<
const
framework
::
SelectedRows
*>&
inputs
,
framework
::
SelectedRows
*
output
)
{
if
(
inputs
.
size
()
==
0
)
{
VLOG
(
3
)
<<
"no input! return"
;
VLOG
(
3
0
)
<<
"no input! return"
;
return
;
}
const
framework
::
SelectedRows
*
has_value_input
=
nullptr
;
...
...
@@ -325,7 +325,7 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
}
}
if
(
has_value_input
==
nullptr
)
{
VLOG
(
3
)
<<
"no input has value! just return"
<<
std
::
endl
;
VLOG
(
3
0
)
<<
"no input has value! just return"
<<
std
::
endl
;
return
;
}
auto
input_width
=
has_value_input
->
value
().
dims
()[
1
];
...
...
paddle/fluid/operators/mean_op.cc
浏览文件 @
d231e550
...
...
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/mean_op.h"
#include <string>
namespace
paddle
{
namespace
operators
{
...
...
@@ -42,6 +42,14 @@ Mean Operator calculates the mean of all elements in X.
}
};
class
MeanOpInferVarType
:
public
framework
::
PassInDtypeAndVarTypeToOutput
{
protected:
std
::
unordered_map
<
std
::
string
,
std
::
string
>
GetInputOutputWithSameType
()
const
override
{
return
std
::
unordered_map
<
std
::
string
,
std
::
string
>
{{
"X"
,
/*->*/
"Out"
}};
}
};
class
MeanGradOp
:
public
framework
::
OperatorWithKernel
{
public:
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
...
...
@@ -50,6 +58,14 @@ class MeanGradOp : public framework::OperatorWithKernel {
ctx
->
SetOutputDim
(
framework
::
GradVarName
(
"X"
),
ctx
->
GetInputDim
(
"X"
));
ctx
->
ShareLoD
(
"X"
,
framework
::
GradVarName
(
"X"
));
}
framework
::
OpKernelType
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
input_data_type
=
framework
::
ToDataType
(
ctx
.
Input
<
Tensor
>
(
"X"
)
->
type
());
return
framework
::
OpKernelType
(
input_data_type
,
ctx
.
GetPlace
());
}
};
class
MeanGradMaker
:
public
framework
::
SingleGradOpDescMaker
{
...
...
@@ -71,7 +87,8 @@ class MeanGradMaker : public framework::SingleGradOpDescMaker {
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
REGISTER_OPERATOR
(
mean
,
ops
::
MeanOp
,
ops
::
MeanOpMaker
,
ops
::
MeanGradMaker
);
REGISTER_OPERATOR
(
mean
,
ops
::
MeanOp
,
ops
::
MeanOpMaker
,
ops
::
MeanOpInferVarType
,
ops
::
MeanGradMaker
);
REGISTER_OPERATOR
(
mean_grad
,
ops
::
MeanGradOp
);
REGISTER_OP_CPU_KERNEL
(
mean
,
ops
::
MeanKernel
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
,
...
...
paddle/fluid/operators/momentum_op.h
浏览文件 @
d231e550
...
...
@@ -346,7 +346,7 @@ class MomentumOpKernel : public framework::OpKernel<T> {
// sparse update maybe empty.
if
(
grad
->
rows
().
size
()
==
0
)
{
VLOG
(
3
)
<<
"Grad SelectedRows contains no data!"
;
VLOG
(
3
0
)
<<
"Grad SelectedRows contains no data!"
;
return
;
}
auto
*
merged_grad
=
const_cast
<
framework
::
Scope
&>
(
ctx
.
scope
())
...
...
paddle/fluid/operators/mul_op.cc
浏览文件 @
d231e550
...
...
@@ -38,9 +38,9 @@ class MulOp : public framework::OperatorWithKernel {
int
x_num_col_dims
=
ctx
->
Attrs
().
Get
<
int
>
(
"x_num_col_dims"
);
int
y_num_col_dims
=
ctx
->
Attrs
().
Get
<
int
>
(
"y_num_col_dims"
);
VLOG
(
3
)
<<
"mul operator x.shape="
<<
x_dims
<<
" y.shape="
<<
y_dims
<<
" x_num_col_dims="
<<
x_num_col_dims
<<
" y_num_col_dims="
<<
y_num_col_dims
;
VLOG
(
3
0
)
<<
"mul operator x.shape="
<<
x_dims
<<
" y.shape="
<<
y_dims
<<
" x_num_col_dims="
<<
x_num_col_dims
<<
" y_num_col_dims="
<<
y_num_col_dims
;
PADDLE_ENFORCE_GT
(
x_dims
.
size
(),
x_num_col_dims
,
...
...
@@ -126,6 +126,14 @@ or not. But the output only shares the LoD information with input $X$.
}
};
class
MulOpInferVarType
:
public
framework
::
PassInDtypeAndVarTypeToOutput
{
protected:
std
::
unordered_map
<
std
::
string
,
std
::
string
>
GetInputOutputWithSameType
()
const
override
{
return
std
::
unordered_map
<
std
::
string
,
std
::
string
>
{{
"X"
,
/*->*/
"Out"
}};
}
};
class
MulGradOp
:
public
framework
::
OperatorWithKernel
{
public:
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
...
...
@@ -178,7 +186,8 @@ class MulOpGradMaker : public framework::SingleGradOpDescMaker {
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
REGISTER_OPERATOR
(
mul
,
ops
::
MulOp
,
ops
::
MulOpMaker
,
ops
::
MulOpGradMaker
);
REGISTER_OPERATOR
(
mul
,
ops
::
MulOp
,
ops
::
MulOpMaker
,
ops
::
MulOpInferVarType
,
ops
::
MulOpGradMaker
);
REGISTER_OPERATOR
(
mul_grad
,
ops
::
MulGradOp
);
REGISTER_OP_CPU_KERNEL
(
mul
,
ops
::
MulKernel
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
,
...
...
paddle/fluid/operators/nccl_op.cu.cc
浏览文件 @
d231e550
...
...
@@ -63,16 +63,16 @@ class NCCLAllReduceKernel : public framework::OpKernel<T> {
// device id
int
gpu_id
=
boost
::
get
<
platform
::
CUDAPlace
>
(
ctx
.
GetPlace
()).
GetDeviceId
();
int
idx
=
comm
->
GetCommId
(
gpu_id
);
VLOG
(
3
)
<<
"gpu : "
<<
" invoke allreduce. send "
<<
x
->
numel
()
<<
" recv "
<<
out
->
numel
();
VLOG
(
3
0
)
<<
"gpu : "
<<
" invoke allreduce. send "
<<
x
->
numel
()
<<
" recv "
<<
out
->
numel
();
PADDLE_ENFORCE
(
platform
::
dynload
::
ncclAllReduce
(
x
->
data
<
T
>
(),
out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
out
->
numel
(),
NCCLTypeWrapper
<
T
>::
type
,
reduction_op_
,
comm
->
comms
().
at
(
idx
),
ctx
.
cuda_device_context
().
stream
()));
VLOG
(
3
)
<<
"gpu : "
<<
" finished allreduce. send "
<<
x
->
numel
()
<<
" recv "
<<
out
->
numel
();
VLOG
(
3
0
)
<<
"gpu : "
<<
" finished allreduce. send "
<<
x
->
numel
()
<<
" recv "
<<
out
->
numel
();
}
};
...
...
@@ -109,14 +109,14 @@ class NCCLReduceKernel : public framework::OpKernel<T> {
}
else
{
out
->
Resize
(
framework
::
make_ddim
({
0
}));
}
VLOG
(
3
)
<<
"gpu : "
<<
gpu_id
<<
" invoke reduce. send "
<<
x
->
numel
()
<<
" recv "
<<
out
->
numel
();
VLOG
(
3
0
)
<<
"gpu : "
<<
gpu_id
<<
" invoke reduce. send "
<<
x
->
numel
()
<<
" recv "
<<
out
->
numel
();
PADDLE_ENFORCE
(
platform
::
dynload
::
ncclReduce
(
x
->
data
<
T
>
(),
recvbuffer
,
x
->
numel
(),
NCCLTypeWrapper
<
T
>::
type
,
reduction_op_
,
root
,
comm
->
comms
().
at
(
idx
),
ctx
.
cuda_device_context
().
stream
()));
VLOG
(
3
)
<<
"gpu : "
<<
gpu_id
<<
" finished reduce. send "
<<
x
->
numel
()
<<
" recv "
<<
out
->
numel
();
VLOG
(
3
0
)
<<
"gpu : "
<<
gpu_id
<<
" finished reduce. send "
<<
x
->
numel
()
<<
" recv "
<<
out
->
numel
();
}
};
...
...
@@ -133,21 +133,22 @@ class NCCLBcastKernel : public framework::OpKernel<T> {
int
idx
=
comm
->
GetCommId
(
gpu_id
);
if
(
idx
==
root
)
{
auto
*
x
=
ctx
.
Input
<
LoDTensor
>
(
"X"
);
VLOG
(
3
)
<<
"gpu : "
<<
gpu_id
<<
" invoke Bcast. send "
<<
x
->
numel
();
VLOG
(
3
0
)
<<
"gpu : "
<<
gpu_id
<<
" invoke Bcast. send "
<<
x
->
numel
();
PADDLE_ENFORCE
(
platform
::
dynload
::
ncclBcast
(
reinterpret_cast
<
void
*>
(
const_cast
<
T
*>
(
x
->
data
<
T
>
())),
x
->
numel
(),
NCCLTypeWrapper
<
T
>::
type
,
root
,
comm
->
comms
().
at
(
idx
),
ctx
.
cuda_device_context
().
stream
()));
VLOG
(
3
)
<<
"gpu : "
<<
gpu_id
<<
" finished Bcast."
;
VLOG
(
3
0
)
<<
"gpu : "
<<
gpu_id
<<
" finished Bcast."
;
}
else
{
auto
*
out
=
ctx
.
Output
<
LoDTensor
>
(
"Out"
);
VLOG
(
3
)
<<
"gpu : "
<<
gpu_id
<<
" invoke Bcast. recv buffer "
<<
framework
::
product
(
out
->
dims
());
VLOG
(
3
0
)
<<
"gpu : "
<<
gpu_id
<<
" invoke Bcast. recv buffer "
<<
framework
::
product
(
out
->
dims
());
PADDLE_ENFORCE
(
platform
::
dynload
::
ncclBcast
(
out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()),
out
->
numel
(),
NCCLTypeWrapper
<
T
>::
type
,
root
,
comm
->
comms
().
at
(
idx
),
ctx
.
cuda_device_context
().
stream
()));
VLOG
(
3
)
<<
"gpu : "
<<
gpu_id
<<
" finished Bcast. recv "
<<
out
->
numel
();
VLOG
(
30
)
<<
"gpu : "
<<
gpu_id
<<
" finished Bcast. recv "
<<
out
->
numel
();
}
}
};
...
...
paddle/fluid/operators/nccl_op_test.cu.cc
浏览文件 @
d231e550
...
...
@@ -86,9 +86,9 @@ class NCCLTester : public ::testing::Test {
(
*
p_scopes
).
resize
(
gpu_list_
.
size
());
auto
op
=
f
::
OpRegistry
::
CreateOp
(
*
op1
);
VLOG
(
1
)
<<
"invoke NCCLInitOp."
;
VLOG
(
1
0
)
<<
"invoke NCCLInitOp."
;
op
->
Run
(
g_scope_
,
cpu_place
);
VLOG
(
1
)
<<
"NCCLInitOp finished."
;
VLOG
(
1
0
)
<<
"NCCLInitOp finished."
;
}
int
GetGPUData
(
int
gpu_id
)
{
return
gpu_id
+
42
;
}
...
...
@@ -109,7 +109,7 @@ class NCCLTester : public ::testing::Test {
std
::
vector
<
T
>
send_vector
(
f
::
product
(
kDims
),
GetGPUData
(
gpu_id
));
paddle
::
framework
::
TensorFromVector
<
T
>
(
send_vector
,
*
ctx
,
send_tensor
);
VLOG
(
1
)
<<
"Send Tensor filled with elements "
<<
send_tensor
->
numel
();
VLOG
(
1
0
)
<<
"Send Tensor filled with elements "
<<
send_tensor
->
numel
();
}
lk
.
unlock
();
...
...
@@ -119,11 +119,11 @@ class NCCLTester : public ::testing::Test {
auto
op
=
f
::
OpRegistry
::
CreateOp
(
*
op1
);
VLOG
(
1
)
<<
"Device : "
<<
gpu_id
<<
" invoke "
<<
op_desc
.
Type
();
VLOG
(
1
)
<<
" send_tensor : "
<<
send_tensor
->
numel
()
<<
" recv_tensor : "
<<
recv_tensor
->
numel
();
VLOG
(
1
0
)
<<
"Device : "
<<
gpu_id
<<
" invoke "
<<
op_desc
.
Type
();
VLOG
(
1
0
)
<<
" send_tensor : "
<<
send_tensor
->
numel
()
<<
" recv_tensor : "
<<
recv_tensor
->
numel
();
op
->
Run
(
*
scope
,
place
);
VLOG
(
1
)
<<
"Device : "
<<
gpu_id
<<
" finished "
<<
op_desc
.
Type
();
VLOG
(
1
0
)
<<
"Device : "
<<
gpu_id
<<
" finished "
<<
op_desc
.
Type
();
}
public:
...
...
paddle/fluid/operators/parallel_do_op.cc
浏览文件 @
d231e550
...
...
@@ -48,7 +48,7 @@ static void SplitTensorAndMoveTensorToScopes(
auto
lod_tensors
=
tensor
.
SplitLoDTensor
(
places
);
for
(
auto
&
lod
:
lod_tensors
)
{
VLOG
(
3
)
<<
lod
.
dims
();
VLOG
(
3
0
)
<<
lod
.
dims
();
}
if
(
num_sub_scopes
==
0
)
{
num_sub_scopes
=
lod_tensors
.
size
();
...
...
@@ -263,7 +263,7 @@ class ParallelDoGradOp : public framework::OperatorBase {
if
(
s
==
framework
::
kEmptyVarName
)
{
continue
;
}
VLOG
(
3
)
<<
"Moving "
<<
s
;
VLOG
(
3
0
)
<<
"Moving "
<<
s
;
CopyOrShare
(
*
sub_scopes
[
0
]
->
FindVar
(
s
),
place
,
scope
.
FindVar
(
s
));
}
WaitOnPlaces
(
places
);
...
...
@@ -277,7 +277,7 @@ class ParallelDoGradOp : public framework::OperatorBase {
if
(
s
==
framework
::
kEmptyVarName
)
{
continue
;
}
VLOG
(
3
)
<<
"Accumulating "
<<
s
;
VLOG
(
3
0
)
<<
"Accumulating "
<<
s
;
if
(
s
==
framework
::
kEmptyVarName
)
continue
;
std
::
string
tmp_name
;
auto
*
tmp
=
sub_scopes
[
0
]
->
Var
(
&
tmp_name
);
...
...
@@ -289,7 +289,7 @@ class ParallelDoGradOp : public framework::OperatorBase {
auto
sum_op
=
framework
::
OpRegistry
::
CreateOp
(
"sum"
,
{{
"X"
,
{
s
,
tmp_name
}}},
{{
"Out"
,
{
s
}}},
framework
::
AttributeMap
{{
"use_mkldnn"
,
{
false
}}});
VLOG
(
10
)
<<
sum_op
->
DebugStringEx
(
sub_scopes
[
0
]);
VLOG
(
10
0
)
<<
sum_op
->
DebugStringEx
(
sub_scopes
[
0
]);
sum_op
->
Run
(
*
sub_scopes
[
0
],
places
[
0
]);
WaitOnPlace
(
places
[
0
]);
}
...
...
@@ -316,7 +316,7 @@ class ParallelDoGradOpDescMaker : public framework::SingleGradOpDescMaker {
auto
*
grad
=
new
framework
::
OpDesc
();
grad
->
SetType
(
"parallel_do_grad"
);
for
(
auto
&
input_param
:
this
->
InputNames
())
{
VLOG
(
3
)
<<
input_param
;
VLOG
(
3
0
)
<<
input_param
;
grad
->
SetInput
(
input_param
,
this
->
Input
(
input_param
));
if
(
input_param
!=
kPlaces
)
{
grad
->
SetOutput
(
framework
::
GradVarName
(
input_param
),
...
...
paddle/fluid/operators/pool_op.cc
浏览文件 @
d231e550
...
...
@@ -40,7 +40,7 @@ int PoolOutputSize(int input_size, int filter_size, int padding, int stride,
return
output_size
;
}
void
PoolOp
::
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
{
void
PoolOp
::
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
{
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"X"
),
"X(Input) of Pooling should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Out"
),
"Out(Output) of Pooling should not be null."
);
...
...
@@ -81,7 +81,7 @@ void PoolOp::InferShape(framework::InferShapeContext *ctx) const {
}
framework
::
OpKernelType
PoolOp
::
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
{
const
framework
::
ExecutionContext
&
ctx
)
const
{
framework
::
LibraryType
library_
{
framework
::
LibraryType
::
kPlain
};
std
::
string
data_format
=
ctx
.
Attr
<
std
::
string
>
(
"data_format"
);
framework
::
DataLayout
layout_
=
framework
::
StringToDataLayout
(
data_format
);
...
...
@@ -104,7 +104,7 @@ framework::OpKernelType PoolOp::GetExpectedKernelType(
layout_
,
library_
);
}
void
PoolOpGrad
::
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
{
void
PoolOpGrad
::
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
{
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"X"
),
"Input(X) must not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
framework
::
GradVarName
(
"X"
)),
"Input(X@GRAD) should not be null."
);
...
...
@@ -112,7 +112,7 @@ void PoolOpGrad::InferShape(framework::InferShapeContext *ctx) const {
}
framework
::
OpKernelType
PoolOpGrad
::
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
{
const
framework
::
ExecutionContext
&
ctx
)
const
{
framework
::
LibraryType
library_
{
framework
::
LibraryType
::
kPlain
};
std
::
string
data_format
=
ctx
.
Attr
<
std
::
string
>
(
"data_format"
);
framework
::
DataLayout
layout_
=
framework
::
StringToDataLayout
(
data_format
);
...
...
@@ -262,6 +262,14 @@ Example:
)DOC"
);
}
class
PoolOpInferVarType
:
public
framework
::
PassInDtypeAndVarTypeToOutput
{
protected:
std
::
unordered_map
<
std
::
string
,
std
::
string
>
GetInputOutputWithSameType
()
const
override
{
return
std
::
unordered_map
<
std
::
string
,
std
::
string
>
{{
"X"
,
/*->*/
"Out"
}};
}
};
void
Pool3dOpMaker
::
Make
()
{
AddInput
(
"X"
,
"(Tensor) The input tensor of pooling operator. "
...
...
@@ -372,6 +380,7 @@ Example:
namespace
ops
=
paddle
::
operators
;
REGISTER_OPERATOR
(
pool2d
,
ops
::
PoolOp
,
ops
::
Pool2dOpMaker
,
ops
::
PoolOpInferVarType
,
paddle
::
framework
::
DefaultGradOpDescMaker
<
true
>
);
REGISTER_OPERATOR
(
pool2d_grad
,
ops
::
PoolOpGrad
);
...
...
@@ -383,6 +392,7 @@ REGISTER_OP_CPU_KERNEL(
ops
::
PoolGradKernel
<
paddle
::
platform
::
CPUDeviceContext
,
double
>
);
REGISTER_OPERATOR
(
pool3d
,
ops
::
PoolOp
,
ops
::
Pool3dOpMaker
,
ops
::
PoolOpInferVarType
,
paddle
::
framework
::
DefaultGradOpDescMaker
<
true
>
);
REGISTER_OPERATOR
(
pool3d_grad
,
ops
::
PoolOpGrad
);
...
...
paddle/fluid/operators/prefetch_op.cc
浏览文件 @
d231e550
...
...
@@ -48,12 +48,12 @@ class PrefetchOp : public framework::OperatorBase {
std
::
vector
<
distributed
::
VarHandlePtr
>
rets
;
for
(
size_t
i
=
0
;
i
<
ins
.
size
();
i
++
)
{
if
(
NeedSend
(
scope
,
ins
[
i
]))
{
VLOG
(
3
)
<<
"sending "
<<
ins
[
i
]
<<
" to "
<<
epmap
[
i
]
<<
" to get "
<<
outs
[
i
]
<<
" back"
;
VLOG
(
3
0
)
<<
"sending "
<<
ins
[
i
]
<<
" to "
<<
epmap
[
i
]
<<
" to get "
<<
outs
[
i
]
<<
" back"
;
rets
.
push_back
(
rpc_client
->
AsyncPrefetchVar
(
epmap
[
i
],
ctx
,
scope
,
ins
[
i
],
outs
[
i
]));
}
else
{
VLOG
(
3
)
<<
"don't send no-initialied variable: "
<<
ins
[
i
];
VLOG
(
3
0
)
<<
"don't send no-initialied variable: "
<<
ins
[
i
];
}
}
for
(
size_t
i
=
0
;
i
<
rets
.
size
();
i
++
)
{
...
...
paddle/fluid/operators/random_crop_op.h
浏览文件 @
d231e550
...
...
@@ -155,8 +155,8 @@ class RandomCropKernel : public framework::OpKernel<T> {
seed
=
*
cpu_seed
.
data
<
int64_t
>
();
}
}
else
{
VLOG
(
5
)
<<
"WARNING: The input 'Seed' is not initialized, use attribute "
"'startup_seed' instead."
;
VLOG
(
5
0
)
<<
"WARNING: The input 'Seed' is not initialized, use attribute "
"'startup_seed' instead."
;
seed
=
ctx
.
Attr
<
int
>
(
"startup_seed"
);
}
auto
shape
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"shape"
);
...
...
paddle/fluid/operators/reader/blocking_queue.h
浏览文件 @
d231e550
...
...
@@ -42,7 +42,7 @@ class BlockingQueue {
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
send_cv_
.
wait
(
lock
,
[
&
]
{
return
queue_
.
size
()
<
capacity_
||
closed_
;
});
if
(
closed_
)
{
VLOG
(
5
)
VLOG
(
5
0
)
<<
"WARNING: Sending an element to a closed reader::BlokcingQueue."
;
return
false
;
}
...
...
@@ -56,7 +56,7 @@ class BlockingQueue {
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
send_cv_
.
wait
(
lock
,
[
&
]
{
return
queue_
.
size
()
<
capacity_
||
closed_
;
});
if
(
closed_
)
{
VLOG
(
5
)
VLOG
(
5
0
)
<<
"WARNING: Sending an element to a closed reader::BlokcingQueue."
;
return
false
;
}
...
...
paddle/fluid/operators/reader/create_shuffle_reader_op.cc
浏览文件 @
d231e550
...
...
@@ -26,7 +26,7 @@ class ShuffleReader : public framework::DecoratedReader {
ShuffleReader
(
const
std
::
shared_ptr
<
ReaderBase
>&
reader
,
size_t
buffer_size
,
size_t
seed
=
0
)
:
DecoratedReader
(
reader
),
buffer_size_
(
buffer_size
),
seed_
(
seed
)
{
VLOG
(
10
)
<<
"Create shuffle reader of "
<<
reader_
;
VLOG
(
10
0
)
<<
"Create shuffle reader of "
<<
reader_
;
if
(
seed_
==
0
)
{
std
::
random_device
device
;
seed_
=
device
();
...
...
@@ -37,7 +37,7 @@ class ShuffleReader : public framework::DecoratedReader {
void
ReadNextImpl
(
std
::
vector
<
framework
::
LoDTensor
>*
out
)
override
{
out
->
clear
();
if
(
iteration_pos_
>=
buffer_
.
size
())
{
VLOG
(
10
)
<<
"Resetting shuffle buffer"
;
VLOG
(
10
0
)
<<
"Resetting shuffle buffer"
;
ReloadBuffer
();
if
(
buffer_
.
empty
())
{
return
;
...
...
@@ -73,7 +73,7 @@ class ShuffleReader : public framework::DecoratedReader {
std
::
mt19937
g
(
seed_
);
std
::
shuffle
(
buffer_
.
begin
(),
buffer_
.
end
(),
g
);
seed_
=
g
();
// update seed_;
VLOG
(
10
)
<<
"random buffer size = "
<<
buffer_
.
size
();
VLOG
(
10
0
)
<<
"random buffer size = "
<<
buffer_
.
size
();
}
size_t
buffer_size_
;
...
...
paddle/fluid/operators/recurrent_op.cc
浏览文件 @
d231e550
...
...
@@ -160,7 +160,7 @@ class RecurrentBase : public framework::OperatorBase {
Callback
callback
)
{
PADDLE_ENFORCE_EQ
(
src_vars
.
size
(),
dst_vars
.
size
());
for
(
size_t
i
=
0
;
i
<
dst_vars
.
size
();
++
i
)
{
VLOG
(
10
)
<<
"Link "
<<
src_vars
[
i
]
<<
" to "
<<
dst_vars
[
i
];
VLOG
(
10
0
)
<<
"Link "
<<
src_vars
[
i
]
<<
" to "
<<
dst_vars
[
i
];
AccessTensor
(
src_scope
,
src_vars
[
i
],
dst_scope
,
dst_vars
[
i
],
callback
);
}
}
...
...
@@ -176,7 +176,7 @@ class RecurrentBase : public framework::OperatorBase {
Callback
callback
)
{
PADDLE_ENFORCE_EQ
(
src_vars
.
size
(),
dst_vars
.
size
());
for
(
size_t
i
=
0
;
i
<
dst_vars
.
size
();
++
i
)
{
VLOG
(
10
)
<<
"Link "
<<
src_vars
[
i
]
<<
" to "
<<
dst_vars
[
i
];
VLOG
(
10
0
)
<<
"Link "
<<
src_vars
[
i
]
<<
" to "
<<
dst_vars
[
i
];
AccessTensor
(
src_scope
,
src_vars
[
i
],
dst_scope
,
dst_vars
[
i
],
callback
);
}
}
...
...
@@ -230,7 +230,7 @@ class RecurrentOp : public RecurrentBase {
void
RunImpl
(
const
framework
::
Scope
&
scope
,
const
platform
::
Place
&
place
)
const
override
{
auto
seq_len
=
static_cast
<
size_t
>
(
this
->
GetSequenceLength
(
scope
));
VLOG
(
3
)
<<
"Static RNN input sequence length = "
<<
seq_len
;
VLOG
(
3
0
)
<<
"Static RNN input sequence length = "
<<
seq_len
;
StepScopes
scopes
=
CreateStepScopes
(
scope
,
seq_len
);
auto
reverse
=
Attr
<
bool
>
(
kReverse
);
...
...
@@ -241,7 +241,7 @@ class RecurrentOp : public RecurrentBase {
for
(
size_t
i
=
0
;
i
<
seq_len
;
++
i
)
{
size_t
seq_offset
=
reverse
?
seq_len
-
i
-
1
:
i
;
VLOG
(
3
)
<<
"Recurrent operate at the time step "
<<
seq_offset
;
VLOG
(
3
0
)
<<
"Recurrent operate at the time step "
<<
seq_offset
;
auto
&
cur_scope
=
scopes
.
CurScope
();
...
...
@@ -334,7 +334,7 @@ class RecurrentGradOp : public RecurrentBase {
for
(
size_t
step_id
=
0
;
step_id
<
seq_len
;
++
step_id
)
{
size_t
seq_offset
=
reverse
?
step_id
:
seq_len
-
step_id
-
1
;
VLOG
(
3
)
<<
"Recurrent backward operate at the time step "
<<
seq_offset
;
VLOG
(
3
0
)
<<
"Recurrent backward operate at the time step "
<<
seq_offset
;
auto
&
cur_scope
=
scopes
.
CurScope
();
// Link outside::output_grads --> inside::output_grads
// inside::output_grad = outside::output_grad[seq_offset:seq_offset+1]
...
...
@@ -348,11 +348,11 @@ class RecurrentGradOp : public RecurrentBase {
});
auto
og_set
=
List2Set
(
Inputs
(
kOutputGrads
));
if
(
VLOG_IS_ON
(
10
))
{
if
(
VLOG_IS_ON
(
10
0
))
{
std
::
ostringstream
sout
;
std
::
copy
(
og_set
.
begin
(),
og_set
.
end
(),
std
::
ostream_iterator
<
std
::
string
>
(
sout
,
","
));
VLOG
(
10
)
<<
" RNN output gradients = ["
<<
sout
.
str
()
<<
"]"
;
VLOG
(
10
0
)
<<
" RNN output gradients = ["
<<
sout
.
str
()
<<
"]"
;
}
// Link states
...
...
@@ -374,7 +374,7 @@ class RecurrentGradOp : public RecurrentBase {
auto
&
ex_tensor
=
ex_scope
.
FindVar
(
ex_grad
)
->
Get
<
framework
::
LoDTensor
>
();
VLOG
(
10
)
<<
" RNN link "
<<
cur_grad
<<
" from "
<<
ex_grad
;
VLOG
(
10
0
)
<<
" RNN link "
<<
cur_grad
<<
" from "
<<
ex_grad
;
auto
*
cur_grad_var
=
cur_scope
.
Var
(
cur_grad
);
auto
cur_grad_tensor
=
cur_grad_var
->
GetMutable
<
framework
::
LoDTensor
>
();
...
...
@@ -382,12 +382,12 @@ class RecurrentGradOp : public RecurrentBase {
}
}
VLOG
(
5
)
<<
"Recurrent memory linking finished "
;
VLOG
(
5
0
)
<<
"Recurrent memory linking finished "
;
// Run step block with cur_scope
executor
.
Run
(
*
program
,
&
cur_scope
,
block
->
ID
(),
false
/*create_local_scope*/
);
VLOG
(
5
)
<<
"executor.Run finished "
;
VLOG
(
5
0
)
<<
"executor.Run finished "
;
auto
local_var_names
=
LocalVarNames
(
cur_scope
);
...
...
@@ -436,7 +436,7 @@ class RecurrentGradOp : public RecurrentBase {
cur_scope
.
Rename
(
new_inside_name
,
inside_grad_name
);
}
}
VLOG
(
5
)
<<
"Accumulate Parameter finished "
;
VLOG
(
5
0
)
<<
"Accumulate Parameter finished "
;
// Copy input gradient from inside to outside
// outside::input_grad[seq_offset: seq_offset + 1] = inside::input_grad
...
...
@@ -455,7 +455,7 @@ class RecurrentGradOp : public RecurrentBase {
auto
dst
=
outside
->
Slice
(
seq_offset
,
seq_offset
+
1
);
framework
::
TensorCopy
(
inside
,
place
,
dev_ctx
,
&
dst
);
});
VLOG
(
5
)
<<
"Link outside gradient finished "
;
VLOG
(
5
0
)
<<
"Link outside gradient finished "
;
if
(
step_id
+
1
==
seq_len
)
{
// at_end
// copy initialize states gradient from inside to outside
...
...
@@ -468,7 +468,7 @@ class RecurrentGradOp : public RecurrentBase {
outside
->
mutable_data
(
place
,
inside
.
type
());
framework
::
TensorCopy
(
inside
,
place
,
dev_ctx
,
outside
);
});
VLOG
(
5
)
<<
"Link initialize state gradient finished "
;
VLOG
(
5
0
)
<<
"Link initialize state gradient finished "
;
}
scopes
.
Next
();
}
...
...
paddle/fluid/operators/recv_op.cc
浏览文件 @
d231e550
...
...
@@ -47,7 +47,7 @@ class RecvOp : public framework::OperatorBase {
std
::
vector
<
distributed
::
VarHandlePtr
>
rets
;
for
(
size_t
i
=
0
;
i
<
outs
.
size
();
i
++
)
{
VLOG
(
3
)
<<
"getting "
<<
outs
[
i
]
<<
" from "
<<
epmap
[
i
];
VLOG
(
3
0
)
<<
"getting "
<<
outs
[
i
]
<<
" from "
<<
epmap
[
i
];
rets
.
push_back
(
rpc_client
->
AsyncGetVar
(
epmap
[
i
],
ctx
,
scope
,
outs
[
i
]));
}
if
(
sync_mode
)
{
...
...
paddle/fluid/operators/rnn_memory_helper_op.cc
浏览文件 @
d231e550
...
...
@@ -93,7 +93,7 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase {
in_grad_var_name
);
if
(
out_grad_var
==
nullptr
)
{
VLOG
(
5
)
<<
"Using fill constant 0 as starting gradient"
;
VLOG
(
5
0
)
<<
"Using fill constant 0 as starting gradient"
;
auto
in_var_name
=
Input
(
"X"
);
auto
*
in_var
=
scope
.
FindVar
(
in_var_name
);
auto
&
in_var_tensor
=
in_var
->
Get
<
framework
::
LoDTensor
>
();
...
...
paddle/fluid/operators/save_op.cc
浏览文件 @
d231e550
...
...
@@ -110,7 +110,7 @@ class SaveOp : public framework::OperatorBase {
lt_var
!=
nullptr
,
"Can not find variable kLookupTablePath for SaveSelectedRows"
);
std
::
string
filename
=
lt_var
->
data
();
VLOG
(
4
)
<<
"SaveSelectedRows get File name: "
<<
filename
;
VLOG
(
4
0
)
<<
"SaveSelectedRows get File name: "
<<
filename
;
MkDirRecursively
(
DirName
(
filename
).
c_str
());
...
...
paddle/fluid/operators/send_barrier_op.cc
浏览文件 @
d231e550
...
...
@@ -42,12 +42,12 @@ class SendBarrierOp : public framework::OperatorBase {
distributed
::
RPCClient
::
GetInstance
<
RPCCLIENT_T
>
(
Attr
<
int
>
(
"trainer_id"
));
VLOG
(
3
)
<<
"SendBarrierOp sync"
;
VLOG
(
3
0
)
<<
"SendBarrierOp sync"
;
// need to wait before sending send_barrier message
PADDLE_ENFORCE
(
rpc_client
->
Wait
(),
"internal error in RPCClient"
);
for
(
auto
&
ep
:
eps
)
{
VLOG
(
3
)
<<
"send barrier, ep: "
<<
ep
;
VLOG
(
3
0
)
<<
"send barrier, ep: "
<<
ep
;
rpc_client
->
AsyncSendBatchBarrier
(
ep
);
}
PADDLE_ENFORCE
(
rpc_client
->
Wait
(),
"internal error in RPCClient"
);
...
...
paddle/fluid/operators/send_op.cc
浏览文件 @
d231e550
...
...
@@ -50,10 +50,10 @@ class SendOp : public framework::OperatorBase {
std
::
vector
<
distributed
::
VarHandlePtr
>
rets
;
for
(
size_t
i
=
0
;
i
<
ins
.
size
();
i
++
)
{
if
(
NeedSend
(
scope
,
ins
[
i
]))
{
VLOG
(
3
)
<<
"sending "
<<
ins
[
i
]
<<
" to "
<<
epmap
[
i
];
VLOG
(
3
0
)
<<
"sending "
<<
ins
[
i
]
<<
" to "
<<
epmap
[
i
];
rets
.
push_back
(
rpc_client
->
AsyncSendVar
(
epmap
[
i
],
ctx
,
scope
,
ins
[
i
]));
}
else
{
VLOG
(
3
)
<<
"don't send no-initialied variable: "
<<
ins
[
i
];
VLOG
(
3
0
)
<<
"don't send no-initialied variable: "
<<
ins
[
i
];
}
}
if
(
sync_send
)
{
...
...
paddle/fluid/operators/send_recv_op_test.cc
浏览文件 @
d231e550
...
...
@@ -120,7 +120,7 @@ void AddOp(const std::string &type, const f::VariableNameMap &inputs,
void
StartServerNet
(
bool
is_sparse
,
std
::
atomic
<
bool
>
*
initialized
)
{
f
::
Scope
scope
;
p
::
CPUPlace
place
;
VLOG
(
4
)
<<
"before init tensor"
;
VLOG
(
4
0
)
<<
"before init tensor"
;
if
(
is_sparse
)
{
InitSelectedRowsInScope
(
place
,
&
scope
);
}
else
{
...
...
@@ -146,7 +146,7 @@ void StartServerNet(bool is_sparse, std::atomic<bool> *initialized) {
attrs
.
insert
({
"PrefetchBlock"
,
prefetch_block
});
attrs
.
insert
({
"grad_to_block_id"
,
std
::
vector
<
std
::
string
>
({
""
})});
attrs
.
insert
({
"sync_mode"
,
true
});
VLOG
(
4
)
<<
"before init op"
;
VLOG
(
4
0
)
<<
"before init op"
;
listen_and_serv_op
=
f
::
OpRegistry
::
CreateOp
(
"listen_and_serv"
,
{{
"X"
,
{
"x1"
}}},
{},
attrs
);
*
initialized
=
true
;
...
...
paddle/fluid/operators/sequence_mask_op.h
浏览文件 @
d231e550
...
...
@@ -127,7 +127,7 @@ class SequenceMaskKernel : public framework::OpKernel<Tx> {
auto
x_numel
=
x
->
numel
();
if
(
maxlen
<
0
)
{
#ifdef __NVCC__
VLOG
(
10
)
VLOG
(
10
0
)
<<
"SequenceMaskOp on GPU may be slow when maxlen is not provided."
;
maxlen
=
static_cast
<
int
>
(
thrust
::
reduce
(
thrust
::
device_pointer_cast
(
x_data
),
...
...
paddle/fluid/operators/sgd_op.h
浏览文件 @
d231e550
...
...
@@ -98,10 +98,10 @@ class SGDOpKernel : public framework::OpKernel<T> {
auto
param_row_width
=
param
.
value
().
dims
()[
1
];
auto
grad_row_width
=
grad
.
value
().
dims
()[
1
];
VLOG
(
4
)
<<
" param rows: "
<<
param
.
rows
().
size
()
<<
" param memory rows: "
<<
param
.
value
().
dims
()[
0
]
<<
" grad rows: "
<<
grad
.
rows
().
size
()
<<
" grad memory rows: "
<<
grad
.
value
().
dims
()[
0
];
VLOG
(
4
0
)
<<
" param rows: "
<<
param
.
rows
().
size
()
<<
" param memory rows: "
<<
param
.
value
().
dims
()[
0
]
<<
" grad rows: "
<<
grad
.
rows
().
size
()
<<
" grad memory rows: "
<<
grad
.
value
().
dims
()[
0
];
PADDLE_ENFORCE_EQ
(
param_row_width
,
grad_row_width
,
"param_row should have the same size with grad_row"
);
...
...
paddle/fluid/operators/similarity_focus_op.cc
0 → 100644
浏览文件 @
d231e550
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/similarity_focus_op.h"
namespace
paddle
{
namespace
operators
{
class
SimilarityFocusOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
public:
void
Make
()
override
{
AddInput
(
"X"
,
"(Tensor, default Tensor<float>), a 4-D tensor with shape,"
" [BatchSize, X, Y, Z]"
);
AddOutput
(
"Out"
,
"(Tensor, default Tensor<float>), the similarity focus mask"
" with the same shape of input X."
);
AddAttr
<
int
>
(
"axis"
,
"(int32), indicating the dimension to be select. It can"
" only be 1, 2, or 3."
);
AddAttr
<
std
::
vector
<
int
>>
(
"indexes"
,
"(std::vector<int32>), indicating the indexes"
" of the selected dimension."
);
AddComment
(
R"DOC(
SimilarityFocus Operator.
Generate a similarity focus mask with the same shape of input using the following method:
1. Extract the 3-D tensor(here the first dimension is BatchSize) corresponding
to the axis according to the indexes. For example, if axis=1 and indexes=[a],
it will get the matrix T=X[:, a, :, :]. In this case, if the shape of input X
is (BatchSize, A, B, C), the shape of tensor T is (BatchSize, B, C).
2. For each index, find the largest numbers in the tensor T, so that the same
row and same column has at most one number(what it means is that if the
largest number has been found in the i-th row and the j-th column, then
the numbers in the i-th row or j-th column will be skipped. And then the
next largest number will be selected from the remaining numbers. Obviously
there will be min(B, C) numbers), and mark the corresponding position of the
3-D similarity focus mask as 1, otherwise as 0. Do elementwise-or for
each index.
3. Broadcast the 3-D similarity focus mask to the same shape of input X.
Refer to `Similarity Focus Layer <http://www.aclweb.org/anthology/N16-1108>`_
)DOC"
);
}
};
class
SimilarityFocusOp
:
public
framework
::
OperatorWithKernel
{
public:
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"X"
),
"Input(X) should be not null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Out"
),
"Output(Out) should be not null."
);
auto
x_dims
=
ctx
->
GetInputDim
(
"X"
);
PADDLE_ENFORCE_EQ
(
x_dims
.
size
(),
4
,
"Input(X)'s rank should be 4."
);
ctx
->
SetOutputDim
(
"Out"
,
x_dims
);
ctx
->
ShareLoD
(
"X"
,
/*->*/
"Out"
);
}
protected:
framework
::
OpKernelType
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
return
framework
::
OpKernelType
(
framework
::
ToDataType
(
ctx
.
Input
<
Tensor
>
(
"X"
)
->
type
()),
platform
::
CPUPlace
());
}
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
REGISTER_OPERATOR
(
similarity_focus
,
ops
::
SimilarityFocusOp
,
ops
::
SimilarityFocusOpMaker
,
paddle
::
framework
::
EmptyGradOpMaker
);
REGISTER_OP_CPU_KERNEL
(
similarity_focus
,
ops
::
SimilarityFocusKernel
<
float
>
,
ops
::
SimilarityFocusKernel
<
double
>
);
paddle/fluid/operators/similarity_focus_op.h
0 → 100644
浏览文件 @
d231e550
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <cstring>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
namespace
paddle
{
namespace
operators
{
using
Tensor
=
framework
::
Tensor
;
template
<
typename
T
>
class
SimilarityFocusKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
Tensor
*
out
=
context
.
Output
<
Tensor
>
(
"Out"
);
const
Tensor
*
x
=
context
.
Input
<
Tensor
>
(
"X"
);
T
*
out_data
=
out
->
mutable_data
<
T
>
(
context
.
GetPlace
());
const
T
*
x_data
=
x
->
data
<
T
>
();
int
axis
=
context
.
Attr
<
int
>
(
"axis"
);
std
::
vector
<
int
>
indexes
=
context
.
Attr
<
std
::
vector
<
int
>>
(
"indexes"
);
int64_t
batch_size
=
x
->
dims
()[
0
];
int64_t
dim
[
4
];
for
(
int
i
=
1
;
i
<=
3
;
++
i
)
{
dim
[
i
]
=
x
->
dims
()[
i
];
}
if
(
indexes
.
size
()
<
1
)
{
PADDLE_THROW
(
"Indexes' size can not be 0."
);
}
for
(
auto
index
:
indexes
)
{
if
(
dim
[
axis
]
<
index
)
{
PADDLE_THROW
(
"Index exceeds tensor shape limit."
);
}
}
int64_t
array_size
=
1
;
for
(
int
i
=
1
;
i
<=
3
;
++
i
)
{
if
(
i
!=
axis
)
{
array_size
*=
dim
[
i
];
}
}
std
::
vector
<
std
::
pair
<
T
,
int64_t
>>
array
(
array_size
);
bool
(
*
cmp
)(
std
::
pair
<
T
,
int64_t
>
,
std
::
pair
<
T
,
int64_t
>
)
=
[](
std
::
pair
<
T
,
int64_t
>
x
,
std
::
pair
<
T
,
int64_t
>
y
)
{
return
x
.
first
>
y
.
first
;
};
int64_t
(
*
compute_index
)(
int64_t
*
,
int
,
int
,
int
,
int
)
=
[](
int64_t
*
dim
,
int
d1
,
int
d2
,
int
d3
,
int
d4
)
{
return
d1
*
dim
[
1
]
*
dim
[
2
]
*
dim
[
3
]
+
d2
*
dim
[
2
]
*
dim
[
3
]
+
d3
*
dim
[
3
]
+
d4
;
};
memset
(
out_data
,
0
,
sizeof
(
T
)
*
batch_size
*
dim
[
1
]
*
dim
[
2
]
*
dim
[
3
]);
for
(
int
i
=
0
;
i
<
batch_size
;
++
i
)
{
for
(
auto
index
:
indexes
)
{
if
(
axis
==
1
)
{
for
(
int
j
=
0
;
j
<
dim
[
2
];
++
j
)
{
for
(
int
k
=
0
;
k
<
dim
[
3
];
++
k
)
{
array
[
j
*
dim
[
3
]
+
k
]
=
std
::
make_pair
(
x_data
[
compute_index
(
dim
,
i
,
index
,
j
,
k
)],
j
*
dim
[
3
]
+
k
);
}
}
std
::
sort
(
array
.
begin
(),
array
.
end
(),
cmp
);
int
tag_num
=
0
;
std
::
vector
<
bool
>
tag2
(
dim
[
2
]),
tag3
(
dim
[
3
]);
for
(
auto
x
:
array
)
{
int
idx2
=
x
.
second
/
dim
[
3
];
int
idx3
=
x
.
second
%
dim
[
3
];
if
(
tag2
[
idx2
]
||
tag3
[
idx3
])
{
continue
;
}
tag_num
++
;
tag2
[
idx2
]
=
true
;
tag3
[
idx3
]
=
true
;
for
(
int
j
=
0
;
j
<
dim
[
1
];
++
j
)
{
out_data
[
compute_index
(
dim
,
i
,
j
,
idx2
,
idx3
)]
=
1
;
}
if
(
tag_num
==
std
::
min
(
dim
[
2
],
dim
[
3
]))
{
break
;
}
}
}
else
if
(
axis
==
2
)
{
for
(
int
j
=
0
;
j
<
dim
[
1
];
++
j
)
{
for
(
int
k
=
0
;
k
<
dim
[
3
];
++
k
)
{
array
[
j
*
dim
[
3
]
+
k
]
=
std
::
make_pair
(
x_data
[
compute_index
(
dim
,
i
,
j
,
index
,
k
)],
j
*
dim
[
3
]
+
k
);
}
}
std
::
sort
(
array
.
begin
(),
array
.
end
(),
cmp
);
int
tag_num
=
0
;
std
::
vector
<
bool
>
tag1
(
dim
[
1
]),
tag3
(
dim
[
3
]);
for
(
auto
x
:
array
)
{
int
idx1
=
x
.
second
/
dim
[
3
];
int
idx3
=
x
.
second
%
dim
[
3
];
if
(
tag1
[
idx1
]
||
tag3
[
idx3
])
{
continue
;
}
tag_num
++
;
tag1
[
idx1
]
=
true
;
tag3
[
idx3
]
=
true
;
for
(
int
j
=
0
;
j
<
dim
[
2
];
++
j
)
{
out_data
[
compute_index
(
dim
,
i
,
idx1
,
j
,
idx3
)]
=
1
;
}
if
(
tag_num
==
std
::
min
(
dim
[
1
],
dim
[
3
]))
{
break
;
}
}
}
else
if
(
axis
==
3
)
{
for
(
int
j
=
0
;
j
<
dim
[
1
];
++
j
)
{
for
(
int
k
=
0
;
k
<
dim
[
2
];
++
k
)
{
array
[
j
*
dim
[
2
]
+
k
]
=
std
::
make_pair
(
x_data
[
compute_index
(
dim
,
i
,
j
,
k
,
index
)],
j
*
dim
[
2
]
+
k
);
}
}
std
::
sort
(
array
.
begin
(),
array
.
end
(),
cmp
);
int
tag_num
=
0
;
std
::
vector
<
bool
>
tag1
(
dim
[
1
]),
tag2
(
dim
[
2
]);
for
(
auto
x
:
array
)
{
int
idx1
=
x
.
second
/
dim
[
2
];
int
idx2
=
x
.
second
%
dim
[
2
];
if
(
tag1
[
idx1
]
||
tag2
[
idx2
])
{
continue
;
}
tag_num
++
;
tag1
[
idx1
]
=
true
;
tag2
[
idx2
]
=
true
;
for
(
int
j
=
0
;
j
<
dim
[
3
];
++
j
)
{
out_data
[
compute_index
(
dim
,
i
,
idx1
,
idx2
,
j
)]
=
1
;
}
if
(
tag_num
==
std
::
min
(
dim
[
1
],
dim
[
2
]))
{
break
;
}
}
}
else
{
PADDLE_THROW
(
"Axis must be 1 or 2 or 3"
);
}
}
}
}
};
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/softmax_op.cc
浏览文件 @
d231e550
...
...
@@ -124,6 +124,14 @@ For each row $i$ and each column $j$ in the matrix, we have:
}
};
class
SoftmaxOpInferVarType
:
public
framework
::
PassInDtypeAndVarTypeToOutput
{
protected:
std
::
unordered_map
<
std
::
string
,
std
::
string
>
GetInputOutputWithSameType
()
const
override
{
return
std
::
unordered_map
<
std
::
string
,
std
::
string
>
{{
"X"
,
/*->*/
"Out"
}};
}
};
class
SoftmaxOpGrad
:
public
framework
::
OperatorWithKernel
{
public:
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
...
...
@@ -196,7 +204,7 @@ class SoftmaxOpGradMaker : public framework::SingleGradOpDescMaker {
namespace
ops
=
paddle
::
operators
;
REGISTER_OPERATOR
(
softmax
,
ops
::
SoftmaxOp
,
ops
::
SoftmaxOpMaker
,
ops
::
SoftmaxOpGradMaker
);
ops
::
SoftmaxOp
InferVarType
,
ops
::
SoftmaxOp
GradMaker
);
REGISTER_OPERATOR
(
softmax_grad
,
ops
::
SoftmaxOpGrad
);
REGISTER_OP_CPU_KERNEL
(
softmax
,
ops
::
SoftmaxKernel
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
,
...
...
paddle/fluid/operators/split_byref_op.h
浏览文件 @
d231e550
...
...
@@ -32,7 +32,7 @@ class SplitByrefOpKernel : public framework::OpKernel<T> {
for
(
size_t
i
=
0
;
i
<
outs
.
size
();
++
i
)
{
// NOTE: no need to call mutable_data here to allocate memory.
auto
*
out
=
outs
[
i
];
VLOG
(
3
)
<<
"spliting by ref: "
<<
row_offset
<<
" "
<<
out
->
dims
()[
0
];
VLOG
(
3
0
)
<<
"spliting by ref: "
<<
row_offset
<<
" "
<<
out
->
dims
()[
0
];
*
out
=
in
->
Slice
(
row_offset
,
row_offset
+
out
->
dims
()[
0
]);
row_offset
+=
out
->
dims
()[
0
];
}
...
...
paddle/fluid/operators/split_ids_op.h
浏览文件 @
d231e550
...
...
@@ -44,7 +44,7 @@ class SplitIdsOpKernel : public framework::OpKernel<T> {
for
(
size_t
i
=
0
;
i
<
ids_tensors
.
size
();
++
i
)
{
batch_size
+=
ids_tensors
[
i
]
->
dims
()[
0
];
}
VLOG
(
4
)
<<
"Get Total BatchSize is: "
<<
batch_size
;
VLOG
(
4
0
)
<<
"Get Total BatchSize is: "
<<
batch_size
;
std
::
vector
<
T
>
all_ids
(
batch_size
);
int
offset
=
0
;
...
...
paddle/fluid/operators/sum_mkldnn_op.cc
浏览文件 @
d231e550
...
...
@@ -186,7 +186,7 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
}
if
(
in_dim
.
empty
())
{
VLOG
(
3
)
<<
"WARNING: all the inputs are empty"
;
VLOG
(
3
0
)
<<
"WARNING: all the inputs are empty"
;
in_dim
=
framework
::
vectorize
(
get_selected_row
(
N
-
1
).
value
().
dims
());
}
else
{
in_dim
[
0
]
=
static_cast
<
int64_t
>
(
first_dim
);
...
...
paddle/fluid/operators/sum_op.cc
浏览文件 @
d231e550
...
...
@@ -45,7 +45,7 @@ class SumOp : public framework::OperatorWithKernel {
size_t
N
=
x_dims
.
size
();
PADDLE_ENFORCE_GT
(
N
,
0
,
"Input tensors count should > 0."
);
if
(
N
==
1
)
{
VLOG
(
3
)
<<
"Warning: sum have only one input, may waste memory"
;
VLOG
(
3
0
)
<<
"Warning: sum have only one input, may waste memory"
;
}
framework
::
DDim
in_dim
({
0
});
...
...
@@ -157,8 +157,8 @@ class SumOpVarTypeInference : public framework::VarTypeInference {
auto
&
inputs
=
op_desc
.
Input
(
"X"
);
auto
var_type
=
framework
::
proto
::
VarType
::
SELECTED_ROWS
;
for
(
auto
&
name
:
op_desc
.
Input
(
"X"
))
{
VLOG
(
10
)
<<
name
<<
" "
<<
block
->
FindRecursiveOrCreateVar
(
name
).
GetType
();
VLOG
(
10
0
)
<<
name
<<
" "
<<
block
->
FindRecursiveOrCreateVar
(
name
).
GetType
();
}
bool
any_input_is_lod_tensor
=
std
::
any_of
(
...
...
paddle/fluid/operators/tensor_array_read_write_op.cc
浏览文件 @
d231e550
...
...
@@ -34,8 +34,8 @@ class WriteToArrayOp : public ArrayOp {
auto
*
out
=
scope
.
FindVar
(
Output
(
"Out"
))
->
GetMutable
<
framework
::
LoDTensorArray
>
();
if
(
offset
>=
out
->
size
())
{
VLOG
(
10
)
<<
"Resize "
<<
Output
(
"Out"
)
<<
" from "
<<
out
->
size
()
<<
" to "
<<
offset
+
1
;
VLOG
(
10
0
)
<<
"Resize "
<<
Output
(
"Out"
)
<<
" from "
<<
out
->
size
()
<<
" to "
<<
offset
+
1
;
out
->
resize
(
offset
+
1
);
}
auto
*
out_tensor
=
&
out
->
at
(
offset
);
...
...
@@ -47,9 +47,9 @@ class WriteToArrayOp : public ArrayOp {
TensorCopy
(
x_tensor
,
place
,
dev_ctx
,
out_tensor
);
}
else
{
VLOG
(
10
)
<<
"WARNING: The input tensor 'x_tensor' holds no memory, so "
"nothing has been written to output array["
<<
offset
<<
"]."
;
VLOG
(
10
0
)
<<
"WARNING: The input tensor 'x_tensor' holds no memory, so "
"nothing has been written to output array["
<<
offset
<<
"]."
;
}
}
};
...
...
@@ -104,7 +104,7 @@ class WriteToArrayInferVarType : public framework::VarTypeInference {
framework
::
BlockDesc
*
block
)
const
override
{
auto
x_name
=
op_desc
.
Input
(
"X"
)[
0
];
auto
out_name
=
op_desc
.
Output
(
"Out"
)[
0
];
VLOG
(
10
)
<<
"Set Variable "
<<
out_name
<<
" as LOD_TENSOR_ARRAY"
;
VLOG
(
10
0
)
<<
"Set Variable "
<<
out_name
<<
" as LOD_TENSOR_ARRAY"
;
auto
&
out
=
block
->
FindRecursiveOrCreateVar
(
out_name
);
out
.
SetType
(
framework
::
proto
::
VarType
::
LOD_TENSOR_ARRAY
);
auto
*
x
=
block
->
FindVarRecursive
(
x_name
);
...
...
@@ -139,7 +139,7 @@ class ReadFromArrayOp : public ArrayOp {
framework
::
TensorCopy
(
x_array
[
offset
],
place
,
dev_ctx
,
out_tensor
);
out_tensor
->
set_lod
(
x_array
[
offset
].
lod
());
}
else
{
VLOG
(
10
)
<<
"offset "
<<
offset
<<
" >= "
<<
x_array
.
size
();
VLOG
(
10
0
)
<<
"offset "
<<
offset
<<
" >= "
<<
x_array
.
size
();
}
}
};
...
...
paddle/fluid/operators/tensor_array_to_tensor_op.cc
0 → 100644
浏览文件 @
d231e550
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <string>
#include <vector>
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/variable.h"
namespace
paddle
{
namespace
operators
{
using
framework
::
Tensor
;
void
LodTensorArray2LodTensorVector
(
const
framework
::
Scope
&
scope
,
const
std
::
string
&
base_name
,
const
std
::
string
&
lod_tensor_array_name
,
std
::
vector
<
std
::
string
>
*
res_names
)
{
auto
&
inx
=
scope
.
FindVar
(
lod_tensor_array_name
)
->
Get
<
framework
::
LoDTensorArray
>
();
for
(
size_t
i
=
0
;
i
<
inx
.
size
();
i
++
)
{
std
::
string
var_name
=
base_name
+
std
::
to_string
(
i
);
framework
::
Variable
*
g_feed_value
=
const_cast
<
framework
::
Scope
&>
(
scope
).
Var
(
var_name
);
auto
&
feed_input
=
*
(
g_feed_value
->
GetMutable
<
paddle
::
framework
::
LoDTensor
>
());
feed_input
.
ShareDataWith
(
inx
[
i
]);
res_names
->
push_back
(
var_name
);
}
}
void
LodTensorVectorResizeFromLodTensorArray
(
const
framework
::
Scope
&
scope
,
const
std
::
string
&
base_name
,
const
std
::
string
&
lod_tensor_array_name
,
std
::
vector
<
std
::
string
>
*
res_names
)
{
auto
&
inx
=
scope
.
FindVar
(
lod_tensor_array_name
)
->
Get
<
framework
::
LoDTensorArray
>
();
for
(
size_t
i
=
0
;
i
<
inx
.
size
();
i
++
)
{
std
::
string
var_name
=
base_name
+
std
::
to_string
(
i
);
framework
::
Variable
*
g_feed_value
=
const_cast
<
framework
::
Scope
&>
(
scope
).
Var
(
var_name
);
auto
&
feed_input
=
*
(
g_feed_value
->
GetMutable
<
paddle
::
framework
::
LoDTensor
>
());
auto
dims
=
inx
[
i
].
dims
();
feed_input
.
Resize
(
dims
);
res_names
->
push_back
(
var_name
);
}
}
void
LodTensorArrayCreateFromLodTensorArray
(
const
framework
::
Scope
&
scope
,
const
std
::
string
&
input_lod_tensor_array_name
,
const
std
::
string
&
output_lod_tensor_array_name
)
{
auto
&
inx
=
scope
.
FindVar
(
input_lod_tensor_array_name
)
->
Get
<
framework
::
LoDTensorArray
>
();
auto
&
grad_inx
=
*
scope
.
FindVar
(
output_lod_tensor_array_name
)
->
GetMutable
<
framework
::
LoDTensorArray
>
();
for
(
size_t
i
=
0
;
i
<
inx
.
size
();
i
++
)
{
std
::
string
var_name
=
output_lod_tensor_array_name
+
std
::
to_string
(
i
);
framework
::
Variable
*
g_feed_value
=
const_cast
<
framework
::
Scope
&>
(
scope
).
Var
(
var_name
);
auto
&
feed_input
=
*
(
g_feed_value
->
GetMutable
<
paddle
::
framework
::
LoDTensor
>
());
grad_inx
.
push_back
(
feed_input
);
}
}
class
LoDTensorArray2TensorOp
:
public
framework
::
OperatorBase
{
public:
using
OperatorBase
::
OperatorBase
;
private:
void
RunImpl
(
const
framework
::
Scope
&
scope
,
const
platform
::
Place
&
place
)
const
override
{
auto
axis
=
Attr
<
int
>
(
"axis"
);
framework
::
AttributeMap
attrs
;
attrs
[
"axis"
]
=
axis
;
auto
&
inx
=
scope
.
FindVar
(
Input
(
"X"
))
->
Get
<
framework
::
LoDTensorArray
>
();
auto
&
out
=
*
scope
.
FindVar
(
Output
(
"Out"
))
->
GetMutable
<
framework
::
LoDTensor
>
();
auto
&
out_inx
=
*
scope
.
FindVar
(
Output
(
"OutIndex"
))
->
GetMutable
<
framework
::
LoDTensor
>
();
const
size_t
n
=
inx
.
size
();
PADDLE_ENFORCE_GT
(
n
,
0
,
"Input tensorarray size should > 0."
);
std
::
string
base_name
=
Inputs
(
"X"
)[
0
];
std
::
vector
<
std
::
string
>
names
;
// get the input tensorarray items' dim in out_inx
auto
out_inx_dim
=
out_inx
.
dims
();
out_inx_dim
[
0
]
=
inx
.
size
();
out_inx
.
Resize
(
out_inx_dim
);
std
::
string
var_name
=
"out_index"
;
framework
::
Variable
*
tmp_index_var
=
const_cast
<
framework
::
Scope
&>
(
scope
).
Var
(
var_name
);
auto
&
tmp_index_tensor
=
*
(
tmp_index_var
->
GetMutable
<
paddle
::
framework
::
LoDTensor
>
());
tmp_index_tensor
.
Resize
(
out_inx_dim
);
int
*
tmp_index_data
=
tmp_index_tensor
.
mutable_data
<
int
>
(
platform
::
CPUPlace
());
auto
out_dims
=
inx
[
0
].
dims
();
size_t
out_dim_sum
=
0
;
for
(
size_t
index
=
0
;
index
<
inx
.
size
();
index
++
)
{
auto
inx_dims
=
inx
[
index
].
dims
();
out_dim_sum
+=
inx_dims
[
axis
];
tmp_index_data
[
index
]
=
inx_dims
[
axis
];
}
out_inx
.
ShareDataWith
(
tmp_index_tensor
);
// get input array items' dims
out_dims
[
axis
]
=
out_dim_sum
;
out
.
Resize
(
out_dims
);
LodTensorArray2LodTensorVector
(
scope
,
base_name
,
Input
(
"X"
),
&
names
);
// Invoke Reshape Op
auto
concat_op
=
framework
::
OpRegistry
::
CreateOp
(
"concat"
,
{{
"X"
,
names
}},
{{
"Out"
,
{
Output
(
"Out"
)}}},
attrs
);
concat_op
->
Run
(
scope
,
place
);
}
};
class
LoDTensorArray2TensorOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
public:
void
Make
()
override
{
AddInput
(
"X"
,
"Input LoDTensorArray of tensor_array_to_tensor operator."
);
AddOutput
(
"Out"
,
"Output tensor of tensor_array_to_tensor operator."
);
AddOutput
(
"OutIndex"
,
"Output input LoDTensorArray items' dims of "
"tensor_array_to_tensor operator."
);
AddAttr
<
int
>
(
"axis"
,
"The axis along which the input tensors will be concatenated."
)
.
SetDefault
(
0
);
AddComment
(
R"DOC(
tensor_array_to_tensor Operator.
Concatenate the input LoDTensorArray along dimension axis to the output Tensor.
Examples:
Input = {[1,2], [3,4], [5,6]}
axis = 0
Output = [[1,2],
[3,4],
[5,6]]
OutputIndex = [1,1,1]
)DOC"
);
}
};
class
LoDTensorArray2TensorOpInferShape
:
public
framework
::
InferShapeBase
{
public:
void
operator
()(
framework
::
InferShapeContext
*
ctx
)
const
override
{}
};
class
LoDTensorArray2TensorGradInferShape
:
public
framework
::
InferShapeBase
{
public:
void
operator
()(
framework
::
InferShapeContext
*
context
)
const
override
{}
};
class
LoDTensorArray2TensorGradInferVarType
:
public
framework
::
VarTypeInference
{
public:
void
operator
()(
const
framework
::
OpDesc
&
op_desc
,
framework
::
BlockDesc
*
block
)
const
override
{
for
(
auto
&
out_var
:
op_desc
.
Output
(
framework
::
GradVarName
(
"X"
)))
{
block
->
Var
(
out_var
)
->
SetType
(
framework
::
proto
::
VarType
::
LOD_TENSOR_ARRAY
);
}
}
};
class
LoDTensorArray2TensorGradOp
:
public
framework
::
OperatorBase
{
public:
using
OperatorBase
::
OperatorBase
;
private:
void
RunImpl
(
const
framework
::
Scope
&
scope
,
const
platform
::
Place
&
place
)
const
override
{
auto
axis
=
Attr
<
int
>
(
"axis"
);
framework
::
AttributeMap
attrs
;
attrs
[
"axis"
]
=
axis
;
auto
&
inx
=
scope
.
FindVar
(
Input
(
"X"
))
->
Get
<
framework
::
LoDTensorArray
>
();
const
size_t
n
=
inx
.
size
();
PADDLE_ENFORCE_GT
(
n
,
0
,
"Input tensorarray size should > 0."
);
std
::
string
base_name
=
Inputs
(
"X"
)[
0
];
std
::
vector
<
std
::
string
>
names
;
LodTensorArray2LodTensorVector
(
scope
,
base_name
,
Input
(
"X"
),
&
names
);
// grad
auto
dx_name
=
Output
(
framework
::
GradVarName
(
"X"
));
auto
dout_name
=
Input
(
framework
::
GradVarName
(
"Out"
));
std
::
vector
<
std
::
string
>
grad_names
;
LodTensorVectorResizeFromLodTensorArray
(
scope
,
"grad_name"
,
Input
(
"X"
),
&
grad_names
);
auto
concat_grad_op
=
framework
::
OpRegistry
::
CreateOp
(
"concat_grad"
,
{{
"X"
,
names
},
{
"Out@GRAD"
,
{
dout_name
}}},
{{
"X@GRAD"
,
grad_names
}},
attrs
);
concat_grad_op
->
Run
(
scope
,
place
);
LodTensorArrayCreateFromLodTensorArray
(
scope
,
Input
(
"X"
),
dx_name
);
auto
&
grad_inx
=
*
scope
.
FindVar
(
dx_name
)
->
GetMutable
<
framework
::
LoDTensorArray
>
();
for
(
size_t
i
=
0
;
i
<
grad_names
.
size
();
i
++
)
{
std
::
string
var_name
=
grad_names
[
i
];
auto
&
feed_input
=
scope
.
FindVar
(
var_name
)
->
Get
<
framework
::
LoDTensor
>
();
grad_inx
[
i
].
ShareDataWith
(
feed_input
);
}
}
};
}
// namespace operators
}
// namespace paddle
USE_OP
(
concat
);
namespace
ops
=
paddle
::
operators
;
REGISTER_OPERATOR
(
tensor_array_to_tensor
,
ops
::
LoDTensorArray2TensorOp
,
ops
::
LoDTensorArray2TensorOpMaker
,
ops
::
LoDTensorArray2TensorOpInferShape
,
paddle
::
framework
::
DefaultGradOpDescMaker
<
true
>
);
REGISTER_OPERATOR
(
tensor_array_to_tensor_grad
,
ops
::
LoDTensorArray2TensorGradOp
,
ops
::
LoDTensorArray2TensorGradInferShape
,
ops
::
LoDTensorArray2TensorGradInferVarType
);
paddle/fluid/operators/tensorrt_engine_op.h
浏览文件 @
d231e550
...
...
@@ -34,7 +34,7 @@ namespace operators {
using
FluidDT
=
framework
::
proto
::
VarType_Type
;
using
TRT_DT
=
nvinfer1
::
DataType
;
namespace
{
namespace
{
// NOLINT
TRT_DT
FluidDataType2TRT
(
FluidDT
type
)
{
switch
(
type
)
{
...
...
@@ -60,7 +60,7 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t>& shape) {
return
nvinfer1
::
DimsCHW
(
shape
[
1
],
1
,
1
);
}
}
// namespace
}
// namespace
// NOLINT
using
inference
::
Singleton
;
using
inference
::
tensorrt
::
TRT_EngineManager
;
...
...
@@ -127,9 +127,9 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
// Convert output tensor from engine to fluid
int
output_index
=
0
;
VLOG
(
4
)
<<
"TensorRT Engine Op Outputs:"
;
VLOG
(
4
0
)
<<
"TensorRT Engine Op Outputs:"
;
for
(
const
auto
&
y
:
context
.
Outputs
(
"Ys"
))
{
VLOG
(
4
)
<<
y
;
VLOG
(
4
0
)
<<
y
;
// convert output and copy to fluid.
nvinfer1
::
ITensor
*
trt_t
=
engine
->
GetITensor
(
output_maps
[
output_index
]);
auto
dims
=
trt_t
->
getDimensions
();
...
...
@@ -167,7 +167,7 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
protected:
void
Prepare
(
const
framework
::
ExecutionContext
&
context
)
const
{
VLOG
(
4
)
<<
"Prepare engine"
;
VLOG
(
4
0
)
<<
"Prepare engine"
;
// Get the ProgramDesc and pass to convert.
framework
::
proto
::
BlockDesc
block_desc
;
block_desc
.
ParseFromString
(
context
.
Attr
<
std
::
string
>
(
"subgraph"
));
...
...
@@ -192,12 +192,12 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
engine
->
InitNetwork
();
framework
::
BlockDesc
block
(
nullptr
/*programdesc*/
,
&
block_desc
);
VLOG
(
4
)
<<
"parsed var size "
<<
block
.
AllVars
().
size
();
VLOG
(
4
0
)
<<
"parsed var size "
<<
block
.
AllVars
().
size
();
// Add inputs
VLOG
(
4
)
<<
"declare inputs"
;
VLOG
(
4
0
)
<<
"declare inputs"
;
for
(
auto
&
input
:
context
.
Inputs
(
"Xs"
))
{
if
(
parameters
.
count
(
input
))
continue
;
VLOG
(
4
)
<<
"declare input "
<<
input
;
VLOG
(
4
0
)
<<
"declare input "
<<
input
;
auto
*
var
=
block
.
FindVar
(
input
);
// TensorRT engine need to create parameters. The parameter's description
// should be set in
...
...
paddle/fluid/operators/while_op.cc
浏览文件 @
d231e550
...
...
@@ -129,15 +129,15 @@ class WhileGradOp : public framework::OperatorBase {
for
(
auto
cur_scope_iter
=
step_scopes
->
rbegin
();
cur_scope_iter
!=
step_scopes
->
rend
();
++
cur_scope_iter
)
{
VLOG
(
3
)
<<
"Start backward at time_step "
<<
cur_scope_iter
-
step_scopes
->
rbegin
();
VLOG
(
3
0
)
<<
"Start backward at time_step "
<<
cur_scope_iter
-
step_scopes
->
rbegin
();
framework
::
Scope
&
cur_scope
=
**
cur_scope_iter
;
// Link OG from outside to inside
for
(
size_t
i
=
0
;
i
<
outside_og_names
.
size
();
++
i
)
{
auto
outside_og_name
=
outside_og_names
[
i
];
auto
inside_og_name
=
inside_og_names
[
i
];
VLOG
(
8
)
<<
"Linking outside "
<<
outside_og_name
<<
" --> inside "
<<
inside_og_name
;
VLOG
(
8
0
)
<<
"Linking outside "
<<
outside_og_name
<<
" --> inside "
<<
inside_og_name
;
if
(
scope
.
FindVar
(
outside_og_name
)
==
nullptr
)
{
continue
;
}
...
...
@@ -159,11 +159,11 @@ class WhileGradOp : public framework::OperatorBase {
auto
&
outside_array
=
og_outside
.
Get
<
framework
::
LoDTensorArray
>
();
auto
&
inside_array
=
detail
::
Ref
(
og_inside
.
GetMutable
<
framework
::
LoDTensorArray
>
());
VLOG
(
8
)
<<
outside_og_name
<<
" size = "
<<
outside_array
.
size
();
VLOG
(
8
0
)
<<
outside_og_name
<<
" size = "
<<
outside_array
.
size
();
inside_array
.
resize
(
outside_array
.
size
());
for
(
size_t
j
=
0
;
j
<
inside_array
.
size
();
++
j
)
{
VLOG
(
8
)
<<
j
<<
" "
<<
outside_array
[
j
].
numel
();
VLOG
(
8
0
)
<<
j
<<
" "
<<
outside_array
[
j
].
numel
();
if
(
outside_array
[
j
].
numel
()
!=
0
)
{
inside_array
[
j
].
set_lod
(
outside_array
[
j
].
lod
());
inside_array
[
j
].
ShareDataWith
(
outside_array
[
j
]);
...
...
@@ -289,7 +289,7 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker {
auto
igs
=
InputGrad
(
kX
,
/*do not drop empty gradient*/
false
);
for
(
auto
&
each_ig
:
igs
)
{
if
(
inner_op_outputs
.
find
(
each_ig
)
==
inner_op_outputs
.
end
())
{
VLOG
(
8
)
<<
"Ignore "
<<
each_ig
;
VLOG
(
8
0
)
<<
"Ignore "
<<
each_ig
;
each_ig
=
framework
::
kEmptyVarName
;
}
}
...
...
@@ -353,8 +353,8 @@ class WhileGradOpVarTypeInference : public framework::VarTypeInference {
auto
&
p_var
=
detail
::
Ref
(
block
->
FindVarRecursive
(
p_names
[
i
]));
auto
*
g_var
=
block
->
FindVarRecursive
(
pg_ig_names
[
i
]);
if
(
g_var
!=
nullptr
)
{
// Gradient could be @EMPTY@
VLOG
(
5
)
<<
"Setting "
<<
pg_ig_names
[
i
]
<<
" following "
<<
p_names
[
i
]
<<
" type: "
<<
p_var
.
GetType
();
VLOG
(
5
0
)
<<
"Setting "
<<
pg_ig_names
[
i
]
<<
" following "
<<
p_names
[
i
]
<<
" type: "
<<
p_var
.
GetType
();
g_var
->
SetType
(
p_var
.
GetType
());
g_var
->
SetDataType
(
p_var
.
GetDataType
());
}
...
...
paddle/fluid/platform/device_context.cc
浏览文件 @
d231e550
...
...
@@ -207,7 +207,10 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
<<
"."
<<
(
driver_version_
%
100
)
/
10
<<
", Runtime Version: "
<<
runtime_version_
/
1000
<<
"."
<<
(
runtime_version_
%
100
)
/
10
;
size_t
cudnn_dso_ver
=
dynload
::
cudnnGetVersion
();
LOG_FIRST_N
(
WARNING
,
1
)
<<
"device: "
<<
place_
.
device
<<
", cuDNN Version: "
<<
cudnn_dso_ver
/
1000
<<
"."
<<
(
cudnn_dso_ver
%
100
)
/
10
<<
"."
;
callback_manager_
.
reset
(
new
StreamCallbackManager
(
stream_
));
}
...
...
paddle/fluid/platform/device_tracer.cc
浏览文件 @
d231e550
...
...
@@ -203,7 +203,7 @@ class DeviceTracerImpl : public DeviceTracer {
void
AddCPURecords
(
const
std
::
string
&
anno
,
uint64_t
start_ns
,
uint64_t
end_ns
,
int64_t
device_id
,
int64_t
thread_id
)
{
if
(
anno
.
empty
())
{
VLOG
(
1
)
<<
"Empty timeline annotation."
;
VLOG
(
1
0
)
<<
"Empty timeline annotation."
;
return
;
}
std
::
lock_guard
<
std
::
mutex
>
l
(
trace_mu_
);
...
...
@@ -216,7 +216,7 @@ class DeviceTracerImpl : public DeviceTracer {
uint32_t
correlation_id
,
uint64_t
bytes
)
{
// 0 means timestamp information could not be collected for the kernel.
if
(
start_ns
==
0
||
end_ns
==
0
)
{
VLOG
(
3
)
<<
name
<<
" cannot be traced"
;
VLOG
(
3
0
)
<<
name
<<
" cannot be traced"
;
return
;
}
std
::
lock_guard
<
std
::
mutex
>
l
(
trace_mu_
);
...
...
@@ -228,7 +228,7 @@ class DeviceTracerImpl : public DeviceTracer {
int64_t
stream_id
,
uint32_t
correlation_id
)
{
// 0 means timestamp information could not be collected for the kernel.
if
(
start
==
0
||
end
==
0
)
{
VLOG
(
3
)
<<
correlation_id
<<
" cannot be traced"
;
VLOG
(
3
0
)
<<
correlation_id
<<
" cannot be traced"
;
return
;
}
std
::
lock_guard
<
std
::
mutex
>
l
(
trace_mu_
);
...
...
@@ -347,7 +347,7 @@ class DeviceTracerImpl : public DeviceTracer {
tracer
->
AddAnnotation
(
cbInfo
->
correlationId
,
anno
);
}
}
else
{
VLOG
(
1
)
<<
"Unhandled API Callback for "
<<
domain
<<
" "
<<
cbid
;
VLOG
(
1
0
)
<<
"Unhandled API Callback for "
<<
domain
<<
" "
<<
cbid
;
}
}
CUpti_SubscriberHandle
subscriber_
;
...
...
paddle/fluid/platform/dynload/cudnn.h
浏览文件 @
d231e550
...
...
@@ -65,51 +65,54 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
* include all needed cudnn functions in HPPL
* different cudnn version has different interfaces
**/
#define CUDNN_DNN_ROUTINE_EACH(__macro) \
__macro(cudnnSetTensor4dDescriptor); \
__macro(cudnnSetTensor4dDescriptorEx); \
__macro(cudnnSetTensorNdDescriptor); \
__macro(cudnnGetTensorNdDescriptor); \
__macro(cudnnGetConvolutionNdForwardOutputDim); \
__macro(cudnnGetConvolutionForwardAlgorithm); \
__macro(cudnnCreateTensorDescriptor); \
__macro(cudnnDestroyTensorDescriptor); \
__macro(cudnnCreateFilterDescriptor); \
__macro(cudnnSetFilter4dDescriptor); \
__macro(cudnnSetFilterNdDescriptor); \
__macro(cudnnGetFilterNdDescriptor); \
__macro(cudnnSetPooling2dDescriptor); \
__macro(cudnnSetPoolingNdDescriptor); \
__macro(cudnnGetPoolingNdDescriptor); \
__macro(cudnnDestroyFilterDescriptor); \
__macro(cudnnCreateConvolutionDescriptor); \
__macro(cudnnCreatePoolingDescriptor); \
__macro(cudnnDestroyPoolingDescriptor); \
__macro(cudnnSetConvolution2dDescriptor); \
__macro(cudnnDestroyConvolutionDescriptor); \
__macro(cudnnSetConvolutionNdDescriptor); \
__macro(cudnnGetConvolutionNdDescriptor); \
__macro(cudnnDeriveBNTensorDescriptor); \
__macro(cudnnCreateSpatialTransformerDescriptor); \
__macro(cudnnSetSpatialTransformerNdDescriptor); \
__macro(cudnnDestroySpatialTransformerDescriptor); \
__macro(cudnnSpatialTfGridGeneratorForward); \
__macro(cudnnSpatialTfGridGeneratorBackward); \
__macro(cudnnSpatialTfSamplerForward); \
__macro(cudnnSpatialTfSamplerBackward); \
__macro(cudnnCreate); \
__macro(cudnnDestroy); \
__macro(cudnnSetStream); \
__macro(cudnnActivationForward); \
__macro(cudnnConvolutionForward); \
__macro(cudnnConvolutionBackwardBias); \
__macro(cudnnGetConvolutionForwardWorkspaceSize); \
__macro(cudnnTransformTensor); \
__macro(cudnnPoolingForward); \
__macro(cudnnPoolingBackward); \
__macro(cudnnSoftmaxBackward); \
__macro(cudnnSoftmaxForward); \
__macro(cudnnGetVersion); \
#define CUDNN_DNN_ROUTINE_EACH(__macro) \
__macro(cudnnSetTensor4dDescriptor); \
__macro(cudnnSetTensor4dDescriptorEx); \
__macro(cudnnSetTensorNdDescriptor); \
__macro(cudnnGetTensorNdDescriptor); \
__macro(cudnnGetConvolutionNdForwardOutputDim); \
__macro(cudnnGetConvolutionForwardAlgorithm); \
__macro(cudnnCreateTensorDescriptor); \
__macro(cudnnDestroyTensorDescriptor); \
__macro(cudnnCreateFilterDescriptor); \
__macro(cudnnSetFilter4dDescriptor); \
__macro(cudnnSetFilterNdDescriptor); \
__macro(cudnnGetFilterNdDescriptor); \
__macro(cudnnSetPooling2dDescriptor); \
__macro(cudnnSetPoolingNdDescriptor); \
__macro(cudnnGetPoolingNdDescriptor); \
__macro(cudnnDestroyFilterDescriptor); \
__macro(cudnnCreateConvolutionDescriptor); \
__macro(cudnnCreatePoolingDescriptor); \
__macro(cudnnDestroyPoolingDescriptor); \
__macro(cudnnSetConvolution2dDescriptor); \
__macro(cudnnDestroyConvolutionDescriptor); \
__macro(cudnnSetConvolutionNdDescriptor); \
__macro(cudnnGetConvolutionNdDescriptor); \
__macro(cudnnDeriveBNTensorDescriptor); \
__macro(cudnnCreateSpatialTransformerDescriptor); \
__macro(cudnnSetSpatialTransformerNdDescriptor); \
__macro(cudnnDestroySpatialTransformerDescriptor); \
__macro(cudnnSpatialTfGridGeneratorForward); \
__macro(cudnnSpatialTfGridGeneratorBackward); \
__macro(cudnnSpatialTfSamplerForward); \
__macro(cudnnSpatialTfSamplerBackward); \
__macro(cudnnCreate); \
__macro(cudnnDestroy); \
__macro(cudnnSetStream); \
__macro(cudnnActivationForward); \
__macro(cudnnConvolutionForward); \
__macro(cudnnConvolutionBackwardBias); \
__macro(cudnnGetConvolutionForwardWorkspaceSize); \
__macro(cudnnTransformTensor); \
__macro(cudnnPoolingForward); \
__macro(cudnnPoolingBackward); \
__macro(cudnnSoftmaxBackward); \
__macro(cudnnSoftmaxForward); \
__macro(cudnnGetVersion); \
__macro(cudnnFindConvolutionForwardAlgorithmEx); \
__macro(cudnnFindConvolutionBackwardFilterAlgorithmEx); \
__macro(cudnnFindConvolutionBackwardDataAlgorithmEx); \
__macro(cudnnGetErrorString);
CUDNN_DNN_ROUTINE_EACH
(
DECLARE_DYNAMIC_LOAD_CUDNN_WRAP
)
...
...
paddle/fluid/platform/dynload/dynamic_loader.cc
浏览文件 @
d231e550
...
...
@@ -72,8 +72,8 @@ static inline std::string join(const std::string& part1,
static
inline
void
*
GetDsoHandleFromDefaultPath
(
const
std
::
string
&
dso_path
,
int
dynload_flags
)
{
VLOG
(
3
)
<<
"Try to find library: "
<<
dso_path
<<
" from default system path."
;
VLOG
(
3
0
)
<<
"Try to find library: "
<<
dso_path
<<
" from default system path."
;
// default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH
// and /usr/local/lib path
void
*
dso_handle
=
dlopen
(
dso_path
.
c_str
(),
dynload_flags
);
...
...
paddle/fluid/platform/gpu_info.cc
浏览文件 @
d231e550
...
...
@@ -124,8 +124,8 @@ size_t GpuMaxChunkSize() {
size_t
available
=
0
;
GpuMemoryUsage
(
&
available
,
&
total
);
VLOG
(
10
)
<<
"GPU Usage "
<<
available
/
1024
/
1024
<<
"M/"
<<
total
/
1024
/
1024
<<
"M"
;
VLOG
(
10
0
)
<<
"GPU Usage "
<<
available
/
1024
/
1024
<<
"M/"
<<
total
/
1024
/
1024
<<
"M"
;
size_t
reserving
=
static_cast
<
size_t
>
(
0.05
*
total
);
// If available less than minimum chunk size, no usable memory exists.
available
=
...
...
paddle/fluid/platform/init.cc
浏览文件 @
d231e550
...
...
@@ -48,7 +48,7 @@ void InitGflags(std::vector<std::string> argv) {
line
+=
' '
;
}
google
::
ParseCommandLineFlags
(
&
argc
,
&
arr
,
true
);
VLOG
(
1
)
<<
"Init commandline: "
<<
line
;
VLOG
(
1
0
)
<<
"Init commandline: "
<<
line
;
});
}
...
...
paddle/fluid/platform/nccl_helper.h
浏览文件 @
d231e550
...
...
@@ -112,7 +112,7 @@ struct NCCLContextMap {
NCCLGroupGuard
gurad
;
for
(
auto
&
gpu_id
:
order_
)
{
int
rank
=
trainer_id
*
order_
.
size
()
+
gpu_id
;
VLOG
(
3
)
<<
"init nccl rank: "
<<
rank
<<
" nranks: "
<<
nranks
;
VLOG
(
3
0
)
<<
"init nccl rank: "
<<
rank
<<
" nranks: "
<<
nranks
;
PADDLE_ENFORCE
(
cudaSetDevice
(
gpu_id
));
PADDLE_ENFORCE
(
platform
::
dynload
::
ncclCommInitRank
(
comms
.
get
()
+
gpu_id
,
nranks
,
*
nccl_id
,
rank
));
...
...
paddle/fluid/pybind/protobuf.cc
浏览文件 @
d231e550
...
...
@@ -61,9 +61,9 @@ struct variant_caster<V<Ts...>> {
if
(
std
::
is_same
<
T
,
std
::
vector
<
float
>>::
value
)
{
auto
caster_ints
=
make_caster
<
std
::
vector
<
int64_t
>>
();
if
(
caster_ints
.
load
(
src
,
convert
))
{
VLOG
(
4
)
<<
"This value are floats and int64_ts satisfy "
"simultaneously, will set it's type to "
"std::vector<int64_t>"
;
VLOG
(
4
0
)
<<
"This value are floats and int64_ts satisfy "
"simultaneously, will set it's type to "
"std::vector<int64_t>"
;
value
=
cast_op
<
std
::
vector
<
int64_t
>>
(
caster_ints
);
return
true
;
}
...
...
paddle/fluid/train/demo/demo_trainer.cc
浏览文件 @
d231e550
...
...
@@ -40,7 +40,7 @@ void ReadBinaryFile(const std::string& filename, std::string* contents) {
std
::
unique_ptr
<
paddle
::
framework
::
ProgramDesc
>
Load
(
paddle
::
framework
::
Executor
*
executor
,
const
std
::
string
&
model_filename
)
{
VLOG
(
3
)
<<
"loading model from "
<<
model_filename
;
VLOG
(
3
0
)
<<
"loading model from "
<<
model_filename
;
std
::
string
program_desc_str
;
ReadBinaryFile
(
model_filename
,
&
program_desc_str
);
...
...
paddle/scripts/paddle_build.sh
浏览文件 @
d231e550
...
...
@@ -614,7 +614,24 @@ EOF
CMD
=
'"true"'
fi
cat
>>
${
PADDLE_ROOT
}
/build/Dockerfile
<<
EOF
if
[
"
$1
"
==
"cp35-cp35m"
]
;
then
cat
>>
${
PADDLE_ROOT
}
/build/Dockerfile
<<
EOF
ADD python/dist/*.whl /
# run paddle version to install python packages first
RUN apt-get update &&
${
NCCL_DEPS
}
RUN apt-get install -y wget python3 python3-pip libgtk2.0-dev dmidecode python3-tk &&
\
pip3 install opencv-python && pip3 install /*.whl; apt-get install -f -y &&
\
apt-get clean -y &&
\
rm -f /*.whl &&
\
${
PADDLE_VERSION
}
&&
\
ldconfig
${
DOCKERFILE_CUDNN_DSO
}
${
DOCKERFILE_CUBLAS_DSO
}
${
DOCKERFILE_GPU_ENV
}
ENV NCCL_LAUNCH_MODE PARALLEL
EOF
else
cat
>>
${
PADDLE_ROOT
}
/build/Dockerfile
<<
EOF
ADD python/dist/*.whl /
# run paddle version to install python packages first
RUN apt-get update &&
${
NCCL_DEPS
}
...
...
@@ -629,6 +646,8 @@ EOF
${
DOCKERFILE_GPU_ENV
}
ENV NCCL_LAUNCH_MODE PARALLEL
EOF
fi
if
[[
${
WITH_GOLANG
:-
OFF
}
==
"ON"
]]
;
then
cat
>>
${
PADDLE_ROOT
}
/build/Dockerfile
<<
EOF
ADD go/cmd/pserver/pserver /usr/bin/
...
...
@@ -703,7 +722,7 @@ function main() {
build
)
cmake_gen
${
PYTHON_ABI
:-
""
}
build
gen_dockerfile
gen_dockerfile
${
PYTHON_ABI
:-
""
}
;;
build_android
)
build_android
...
...
@@ -730,7 +749,7 @@ function main() {
gen_html
;;
dockerfile
)
gen_dockerfile
gen_dockerfile
${
PYTHON_ABI
:-
""
}
;;
capi
)
cmake_gen
${
PYTHON_ABI
:-
""
}
...
...
paddle/testing/TestUtil.cpp
浏览文件 @
d231e550
...
...
@@ -118,7 +118,7 @@ void generateSequenceStartPositions(size_t batchSize,
}
buf
[
i
]
=
pos
;
pos
+=
len
;
VLOG
(
1
)
<<
" len="
<<
len
;
VLOG
(
1
0
)
<<
" len="
<<
len
;
}
buf
[
numSeqs
]
=
batchSize
;
}
...
...
python/paddle/fluid/__init__.py
浏览文件 @
d231e550
...
...
@@ -34,6 +34,7 @@ from . import regularizer
from
.
import
average
from
.
import
metrics
from
.
import
transpiler
from
.
import
distribute_lookup_table
from
.param_attr
import
ParamAttr
,
WeightNormParamAttr
from
.data_feeder
import
DataFeeder
from
.core
import
LoDTensor
,
LoDTensorArray
,
CPUPlace
,
CUDAPlace
,
CUDAPinnedPlace
,
Scope
...
...
@@ -126,7 +127,8 @@ def __bootstrap__():
if
core
.
is_compiled_with_cuda
():
read_env_flags
+=
[
'fraction_of_gpu_memory_to_use'
,
'cudnn_deterministic'
'fraction_of_gpu_memory_to_use'
,
'cudnn_deterministic'
,
'conv_workspace_size_limit'
,
'cudnn_exhaustive_search'
]
core
.
init_gflags
([
sys
.
argv
[
0
]]
+
[
"--tryfromenv="
+
","
.
join
(
read_env_flags
)])
...
...
python/paddle/fluid/distribute_lookup_table.py
0 → 100644
浏览文件 @
d231e550
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
LOOKUP_TABLE_TYPE
=
"lookup_table"
def
find_distributed_lookup_table
(
program
):
"""
Find distribute lookup table in program.
We only support one distribute table now.
:param program:
:return: table_name or None
"""
table_name
=
None
for
op
in
program
.
global_block
().
ops
:
if
op
.
type
==
LOOKUP_TABLE_TYPE
:
if
op
.
attr
(
'is_distributed'
)
is
True
:
if
table_name
is
None
:
table_name
=
op
.
input
(
"W"
)[
0
]
if
table_name
!=
op
.
input
(
"W"
)[
0
]:
raise
RuntimeError
(
"all distributed lookup_table_ops"
" should have only one table"
)
else
:
if
table_name
is
not
None
:
assert
op
.
input
(
"W"
)[
0
]
!=
table_name
return
table_name
python/paddle/fluid/layers/nn.py
浏览文件 @
d231e550
...
...
@@ -27,6 +27,7 @@ from .tensor import concat
from
.
import
utils
from
..
import
unique_name
from
functools
import
reduce
from
..
import
core
__all__
=
[
'fc'
,
...
...
@@ -101,6 +102,7 @@ __all__ = [
'image_resize'
,
'image_resize_short'
,
'resize_bilinear'
,
'resize_nearest'
,
'gather'
,
'scatter'
,
'sequence_scatter'
,
...
...
@@ -158,6 +160,7 @@ __all__ = [
'affine_grid'
,
'sequence_reverse'
,
'affine_channel'
,
'similarity_focus'
,
'hash'
,
'grid_sampler'
,
'log_loss'
,
...
...
@@ -1665,6 +1668,20 @@ def conv2d(input,
pre_bias
=
helper
.
create_variable_for_type_inference
(
dtype
)
if
use_cudnn
:
helper
.
create_variable
(
name
=
"kCUDNNFwdAlgoCache"
,
persistable
=
True
,
type
=
core
.
VarDesc
.
VarType
.
RAW
)
helper
.
create_variable
(
name
=
"kCUDNNBwdDataAlgoCache"
,
persistable
=
True
,
type
=
core
.
VarDesc
.
VarType
.
RAW
)
helper
.
create_variable
(
name
=
"kCUDNNBwdFilterAlgoCache"
,
persistable
=
True
,
type
=
core
.
VarDesc
.
VarType
.
RAW
)
helper
.
append_op
(
type
=
l_type
,
inputs
=
{
...
...
@@ -1678,7 +1695,7 @@ def conv2d(input,
'dilations'
:
dilation
,
'groups'
:
groups
,
'use_cudnn'
:
use_cudnn
,
'use_mkldnn'
:
False
'use_mkldnn'
:
False
,
})
pre_act
=
helper
.
append_bias_op
(
pre_bias
,
dim_start
=
1
,
dim_end
=
2
)
...
...
@@ -5640,7 +5657,8 @@ def image_resize(input,
out_shape
=
None
,
scale
=
None
,
name
=
None
,
resample
=
'BILINEAR'
):
resample
=
'BILINEAR'
,
actual_shape
=
None
):
"""
**Resize a Batch of Images**
...
...
@@ -5650,6 +5668,7 @@ def image_resize(input,
Supporting resample methods:
'BILINEAR' : Bilinear interpolation
'NEAREST' : Nearest neighbor interpolation
Args:
input (Variable): The input tensor of image resize layer,
...
...
@@ -5664,25 +5683,51 @@ def image_resize(input,
Default: None
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
resample(str): The resample method. It can only be 'BILINEAR' currently.
resample(str): The resample method. It supports 'BILINEAR' and 'NEAREST'
currently.
Default: 'BILINEAR'
actual_shape(Variable): An optional input to specify output shape
dynamically. If provided, image resize
according to this given shape rather than
:attr:`out_shape` and :attr:`scale` specifying
shape. That is to say actual_shape has the
highest priority. It is recommended to use
actual_shape instead of :attr:`out_shape` if you
want to specify output shape dynamically. When
using actual_shape to specify output shape, one of
:attr:`out_shape` and :attr:`scale` should also be
set, otherwise errors would be occured in graph
constructing stage.
Default: None
Returns:
Variable: The output is a 4-D tensor of the shape
(num_batches, channls, out_h, out_w).
Raises:
TypeError: out_shape should be a list or tuple or Variable.
TypeError: actual_shape should either be Variable or None.
ValueError: The 'resample' of image_resize can only be 'BILINEAR'
or 'NEAREST' currently.
ValueError: One of out_shape and scale must not be None.
ValueError: out_shape length should be 2.
Examples:
.. code-block:: python
out = fluid.layers.image_resize(input, out_shape=[12, 12])
"""
resample_methods
=
{
'BILINEAR'
:
'bilinear_interp'
}
resample_methods
=
{
'BILINEAR'
:
'bilinear'
,
'NEAREST'
:
'nearest'
,
}
if
resample
not
in
resample_methods
:
raise
ValueError
(
"The 'resample' of image_resize can only be 'BILINEAR' currently."
)
"The 'resample' of image_resize can only be 'BILINEAR' or 'NEAREST' currently."
)
if
out_shape
is
None
and
scale
is
None
:
raise
ValueError
(
"One of out_shape and scale must not be None"
)
helper
=
LayerHelper
(
'
bilinear_interp
'
,
**
locals
())
raise
ValueError
(
"One of out_shape and scale must not be None
.
"
)
helper
=
LayerHelper
(
'
interpolate
'
,
**
locals
())
dtype
=
helper
.
input_dtype
()
def
_is_list_or_turple_
(
data
):
...
...
@@ -5692,33 +5737,106 @@ def image_resize(input,
out_w
=
0
inputs
=
{
"X"
:
input
}
if
out_shape
is
not
None
:
if
not
(
_is_list_or_turple_
(
out_shape
)
and
len
(
out_shape
)
==
2
)
and
not
isinstance
(
out_shape
,
Variable
):
raise
ValueError
(
'out_shape should be a list or tuple or variable'
)
if
_is_list_or_turple_
(
out_shape
):
out_shape
=
list
(
map
(
int
,
out_shape
))
out_h
=
out_shape
[
0
]
out_w
=
out_shape
[
1
]
else
:
if
isinstance
(
out_shape
,
Variable
):
warnings
.
warn
(
"out_shape as Variable type is deprecated,
\
it is recommended to use actual_shape instead of
\
out_shape to specify output shape dynamically."
)
inputs
[
'OutSize'
]
=
out_shape
elif
not
(
_is_list_or_turple_
(
out_shape
)):
raise
TypeError
(
"out_shape should be a list or tuple or Variable."
)
elif
len
(
out_shape
)
!=
2
:
raise
ValueError
(
"out_shape length should be 2."
)
out_shape
=
list
(
map
(
int
,
out_shape
))
out_h
=
out_shape
[
0
]
out_w
=
out_shape
[
1
]
else
:
out_h
=
int
(
input
.
shape
[
2
]
*
scale
)
out_w
=
int
(
input
.
shape
[
3
]
*
scale
)
if
isinstance
(
actual_shape
,
Variable
):
inputs
[
"OutSize"
]
=
actual_shape
elif
actual_shape
is
not
None
:
raise
TypeError
(
"actual_shape should either be Variable or None."
)
out
=
helper
.
create_variable_for_type_inference
(
dtype
)
helper
.
append_op
(
type
=
resample_methods
[
resample
]
,
type
=
'interpolate'
,
inputs
=
inputs
,
outputs
=
{
"Out"
:
out
},
attrs
=
{
"out_h"
:
out_h
,
"out_w"
:
out_w
})
attrs
=
{
"out_h"
:
out_h
,
"out_w"
:
out_w
,
"interp_method"
:
resample_methods
[
resample
]
})
return
out
@
templatedoc
(
op_type
=
"bilinear_interp"
)
def
resize_bilinear
(
input
,
out_shape
=
None
,
scale
=
None
,
name
=
None
):
@
templatedoc
(
op_type
=
"interpolate"
)
def
resize_bilinear
(
input
,
out_shape
=
None
,
scale
=
None
,
name
=
None
,
actual_shape
=
None
):
"""
${comment}
Resize input by performing bilinear interpolation based on given
output shape which specified by actual_shape, out_shape and scale
in priority order.
Bilinear interpolation is an extension of linear interpolation for
interpolating functions of two variables (e.g. H-direction and
W-direction in this op) on a rectilinear 2D grid. The key idea is
to perform linear interpolation first in one direction, and then
again in the other direction.
For details of bilinear interpolation, please refer to Wikipedia:
https://en.wikipedia.org/wiki/Bilinear_interpolation
Args:
input(${x_type}): ${x_comment}.
out_shape(${out_size_type}): ${out_size_comment}.
scale(float|None): The multiplier for the input height or width. At
least one of out_shape or scale must be set. And out_shape has
a higher priority than scale. Default: None.
name(str|None): The output variable name.
actual_shape(Variable): An optional input to specify output shape
dynamically. If provided, image resize
according to this given shape rather than
:attr:`out_shape` and :attr:`scale` specifying
shape. That is to say actual_shape has the
highest priority. It is recommended to use
actual_shape instead of :attr:`out_shape` if you
want to specify output shape dynamically. When
using actual_shape to specify output shape, one of
:attr:`out_shape` and :attr:`scale` should also be
set, otherwise errors would be occured in graph
constructing stage.
Default: None
Returns:
${out_comment}.
"""
return
image_resize
(
input
,
out_shape
,
scale
,
name
,
'BILINEAR'
,
actual_shape
)
@
templatedoc
(
op_type
=
"interpolate"
)
def
resize_nearest
(
input
,
out_shape
=
None
,
scale
=
None
,
name
=
None
,
actual_shape
=
None
):
"""
Resize input by performing nearest neighbor interpolation in both the
3rd dimention(in height direction) and the 4th dimention(in width
direction) based on given output shape which specified by actual_shape,
out_shape and scale in priority order.
For details of nearest neighbor interpolation, please refer to Wikipedia:
https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation
Args:
input(${x_type}): ${x_comment}.
...
...
@@ -5730,12 +5848,25 @@ def resize_bilinear(input, out_shape=None, scale=None, name=None):
a higher priority than scale. Default: None.
name(str|None): The output variable name.
actual_shape(Variable): An optional input to specify output shape
dynamically. If provided, image resize
according to this given shape rather than
:attr:`out_shape` and :attr:`scale` specifying
shape. That is to say actual_shape has the
highest priority. It is recommended to use
actual_shape instead of :attr:`out_shape` if you
want to specify output shape dynamically. When
using actual_shape to specify output shape, one of
:attr:`out_shape` and :attr:`scale` should also be
set, otherwise errors would be occured in graph
constructing stage.
Default: None
Returns:
${out_comment}.
"""
return
image_resize
(
input
,
out_shape
,
scale
,
name
,
'
BILINEAR'
)
return
image_resize
(
input
,
out_shape
,
scale
,
name
,
'
NEAREST'
,
actual_shape
)
def
image_resize_short
(
input
,
out_short_len
,
resample
=
'BILINEAR'
):
...
...
@@ -7803,6 +7934,118 @@ def affine_channel(x, scale=None, bias=None, data_layout='NCHW', name=None):
return
out
def
similarity_focus
(
input
,
axis
,
indexes
,
name
=
None
):
"""
SimilarityFocus Operator
Generate a similarity focus mask with the same shape of input using the following method:
1. Extract the 3-D tensor(here the first dimension is BatchSize) corresponding
to the axis according to the indexes. For example, if axis=1 and indexes=[a],
it will get the matrix T=X[:, a, :, :]. In this case, if the shape of input X
is (BatchSize, A, B, C), the shape of tensor T is (BatchSize, B, C).
2. For each index, find the largest numbers in the tensor T, so that the same
row and same column has at most one number(what it means is that if the
largest number has been found in the i-th row and the j-th column, then
the numbers in the i-th row or j-th column will be skipped. And then the
next largest number will be selected from the remaining numbers. Obviously
there will be min(B, C) numbers), and mark the corresponding position of the
3-D similarity focus mask as 1, otherwise as 0. Do elementwise-or for
each index.
3. Broadcast the 3-D similarity focus mask to the same shape of input X.
Refer to `Similarity Focus Layer <http://www.aclweb.org/anthology/N16-1108>`_
.. code-block:: text
* Example :
Given a 4-D tensor x with the shape (BatchSize, C, A, B), where C is
the number of channels and the shape of feature map is (A, B):
x.shape = (2, 3, 2, 2)
x.data = [[[[0.8, 0.1],
[0.4, 0.5]],
[[0.9, 0.7],
[0.9, 0.9]],
[[0.8, 0.9],
[0.1, 0.2]]],
[[[0.2, 0.5],
[0.3, 0.4]],
[[0.9, 0.7],
[0.8, 0.4]],
[[0.0, 0.2],
[0.4, 0.7]]]]
Given axis: 1 (the axis of the channel)
Given indexes: [0]
then we get a 4-D tensor out with the same shape of input x:
out.shape = (2, 3, 2, 2)
out.data = [[[[1.0, 0.0],
[0.0, 1.0]],
[[1.0, 0.0],
[0.0, 1.0]],
[[1.0, 0.0],
[0.0, 1.0]]],
[[[0.0, 1.0],
[1.0, 0.0]],
[[0.0, 1.0],
[1.0, 0.0]],
[[0.0, 1.0],
[1.0, 0.0]]]]
Args:
input(Variable): The input tensor variable(default float). It should
be a 4-D tensor with shape [BatchSize, A, B, C].
axis(int): Indicating the dimension to be selected. It can only be
1, 2 or 3.
indexes(list): Indicating the indexes of the selected dimension.
Returns:
Variable: A tensor variable with the same shape and same type
as the input.
Examples:
.. code-block:: python
data = fluid.layers.data(
name='data', shape=[2, 3, 2, 2], dtype='float32')
x = fluid.layers.layer_norm(input=data, axis=1, indexes=[0])
"""
helper
=
LayerHelper
(
'similarity_focus'
,
**
locals
())
# check attrs
if
isinstance
(
axis
,
int
)
is
False
:
raise
TypeError
(
"axis must be int type."
)
if
isinstance
(
indexes
,
list
)
is
False
:
raise
TypeError
(
"indexes must be list type."
)
if
axis
!=
1
and
axis
!=
2
and
axis
!=
3
:
raise
ValueError
(
"axis must be 1, 2 or 3."
)
if
len
(
indexes
)
==
0
:
raise
ValueError
(
"indexes can not be empty."
)
if
name
is
None
:
out
=
helper
.
create_variable_for_type_inference
(
dtype
=
input
.
dtype
)
else
:
out
=
helper
.
create_variable
(
name
=
name
,
dtype
=
input
.
dtype
,
persistable
=
False
)
helper
.
append_op
(
type
=
'similarity_focus'
,
inputs
=
{
'X'
:
input
},
outputs
=
{
'Out'
:
out
},
attrs
=
{
"axis"
:
axis
,
"indexes"
:
indexes
})
return
out
def
hash
(
input
,
hash_size
,
num_hash
=
1
,
name
=
None
):
"""
Hash the input to an integer whose value is less than the given hash size.
...
...
python/paddle/fluid/layers/tensor.py
浏览文件 @
d231e550
...
...
@@ -24,10 +24,10 @@ from .layer_function_generator import templatedoc
import
numpy
__all__
=
[
'create_tensor'
,
'create_parameter'
,
'create_global_var'
,
'cast'
,
'concat'
,
'
sums'
,
'assign'
,
'fill_constant_batch_size_like'
,
'fill_constant
'
,
'
argmin'
,
'argmax'
,
'argsort'
,
'ones'
,
'zeros'
,
'reverse'
,
'has_inf
'
,
'has_nan'
,
'isfinite'
'create_tensor'
,
'create_parameter'
,
'create_global_var'
,
'cast'
,
'
tensor_array_to_tensor'
,
'concat'
,
'sums'
,
'assign
'
,
'
fill_constant_batch_size_like'
,
'fill_constant'
,
'argmin'
,
'argmax
'
,
'
argsort'
,
'ones'
,
'zeros'
,
'reverse'
,
'has_inf'
,
'
has_nan'
,
'isfinite'
]
...
...
@@ -193,6 +193,60 @@ def concat(input, axis=0, name=None):
return
out
def
tensor_array_to_tensor
(
input
,
axis
=
1
,
name
=
None
):
"""
This function concatenates the input LodTensorArray along the axis mentioned
and returns that as the output.
A simple example as below:
.. code-block:: text
Given:
input.data = {[[0.6, 0.1, 0.3],
[0.5, 0.3, 0.2]],
[[1.3],
[1.8]],
[[2.3, 2.1],
[2.5, 2.4]]}
axis = 1
Then:
output.data = [[0.6, 0.1, 0.3, 1.3, 2.3, 2.1],
[0.5, 0.3, 0.2, 1.8, 2.5, 2.4]]
output_index.data = [3, 1, 2]
Args:
input(list): Input LodTensorArray
axis(int): Integer axis along which the tensors will be concatenated
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
Returns:
Variable: Output variable of the concatenation
Variable: The input LodTensorArray items' dims along the axis
Examples:
.. code-block:: python
output, output_index = fluid.layers.tensor_array_to_tensor(input=tensor_array)
"""
helper
=
LayerHelper
(
'tensor_array_concat'
,
**
locals
())
out
=
helper
.
create_variable_for_type_inference
(
dtype
=
helper
.
input_dtype
())
out_index
=
helper
.
create_variable_for_type_inference
(
dtype
=
"int32"
)
helper
.
append_op
(
type
=
'tensor_array_concat'
,
inputs
=
{
'X'
:
input
},
outputs
=
{
'Out'
:
[
out
],
'OutIndex'
:
[
out_index
]},
attrs
=
{
'axis'
:
axis
})
return
out
,
out_index
def
sums
(
input
,
out
=
None
):
"""
This function performs the sum operation on the input and returns the
...
...
python/paddle/fluid/optimizer.py
浏览文件 @
d231e550
...
...
@@ -13,21 +13,23 @@
# limitations under the License.
from
__future__
import
print_function
import
re
import
sys
from
collections
import
defaultdict
from
contextlib
import
contextmanager
from
paddle.fluid.framework
import
Program
,
Variable
,
name_scope
,
default_main_program
from
paddle.fluid.distribute_lookup_table
import
find_distributed_lookup_table
from
.
import
framework
from
.
import
layers
from
.
import
unique_name
from
.backward
import
append_backward
from
.clip
import
append_gradient_clip_ops
,
error_clip_callback
from
.framework
import
program_guard
from
.
import
unique_name
from
.initializer
import
Constant
from
.layer_helper
import
LayerHelper
from
.regularizer
import
append_regularization_ops
from
.clip
import
append_gradient_clip_ops
,
error_clip_callback
from
contextlib
import
contextmanager
from
.layers
import
ops
from
.regularizer
import
append_regularization_ops
__all__
=
[
'SGD'
,
'Momentum'
,
'Adagrad'
,
'Adam'
,
'Adamax'
,
'DecayedAdagrad'
,
'Ftrl'
,
...
...
@@ -85,7 +87,7 @@ class Optimizer(object):
name
=
unique_name
.
generate
(
"learning_rate"
),
shape
=
[
1
],
value
=
float
(
self
.
_learning_rate
),
dtype
=
'float32'
if
self
.
_dtype
==
None
else
self
.
_dtype
,
dtype
=
'float32'
if
self
.
_dtype
is
None
else
self
.
_dtype
,
persistable
=
True
)
def
_global_learning_rate
(
self
,
program
=
None
):
...
...
@@ -245,6 +247,50 @@ class Optimizer(object):
end
=
len
(
global_block
.
ops
)
return
global_block
.
_slice_ops
(
start
,
end
)
def
_process_distribute_lookuptable
(
self
,
param_grads
,
loss
,
startup_program
):
"""
Because distribute lookup table only support SGD optimizer for now, not support
other optimizer and regularization, so we should find the table parameter out,
and avoid to add regularization and other op for it, and add sgd optimize op
for it independently.
:param param_grads(list((Var, Var))): list of (param, grad) pair.
:param loss: the loss variable.
:param startup_program: the startup program
"""
program
=
loss
.
block
.
program
table_name
=
find_distributed_lookup_table
(
program
)
table_param
=
None
table_grad
=
None
new_param_grads
=
[]
for
p
,
g
in
param_grads
:
if
p
.
name
==
table_name
:
if
table_param
is
not
None
:
raise
RuntimeError
(
"multi dist table var found, only support one now!"
)
table_param
=
p
table_grad
=
g
else
:
new_param_grads
.
append
((
p
,
g
))
sgd_op
=
None
if
table_param
is
not
None
:
with
program_guard
(
program
,
startup_program
):
param_and_grad
=
[
table_param
,
table_grad
]
with
table_param
.
block
.
program
.
_optimized_guard
(
param_and_grad
),
\
framework
.
name_scope
(
"optimizer"
):
self
.
_create_global_learning_rate
()
# create the optimize op
sgd_op
=
loss
.
block
.
append_op
(
type
=
'sgd'
,
inputs
=
{
"Param"
:
table_param
,
"Grad"
:
table_grad
,
"LearningRate"
:
self
.
_create_param_lr
(
param_and_grad
)
},
outputs
=
{
"ParamOut"
:
param_and_grad
[
0
]})
return
new_param_grads
,
(
table_param
,
table_grad
),
sgd_op
def
minimize
(
self
,
loss
,
startup_program
=
None
,
...
...
@@ -260,6 +306,9 @@ class Optimizer(object):
params_grads
=
sorted
(
params_grads
,
key
=
lambda
x
:
x
[
0
].
name
)
params_grads
,
table_param_and_grad
,
table_optimize_op
=
\
self
.
_process_distribute_lookuptable
(
params_grads
,
loss
,
startup_program
)
params_grads
=
append_gradient_clip_ops
(
params_grads
)
# Add regularization if any
...
...
@@ -268,6 +317,9 @@ class Optimizer(object):
optimize_ops
=
self
.
_create_optimization_pass
(
params_grads
,
loss
,
startup_program
)
if
table_optimize_op
is
not
None
:
optimize_ops
.
append
(
table_optimize_op
)
params_grads
.
append
(
table_param_and_grad
)
return
optimize_ops
,
params_grads
...
...
python/paddle/fluid/tests/book/test_label_semantic_roles.py
浏览文件 @
d231e550
...
...
@@ -38,7 +38,7 @@ depth = 8
mix_hidden_lr
=
1e-3
IS_SPARSE
=
True
PASS_NUM
=
1
0
PASS_NUM
=
1
BATCH_SIZE
=
10
embedding_name
=
'emb'
...
...
python/paddle/fluid/tests/unittests/test_conv2d_op.py
浏览文件 @
d231e550
...
...
@@ -67,6 +67,7 @@ class TestConv2dOp(OpTest):
def
setUp
(
self
):
self
.
op_type
=
"conv2d"
self
.
use_cudnn
=
False
self
.
exhaustive_search
=
False
self
.
use_cuda
=
False
self
.
use_mkldnn
=
False
self
.
data_format
=
"AnyLayout"
...
...
@@ -98,7 +99,8 @@ class TestConv2dOp(OpTest):
'dilations'
:
self
.
dilations
,
'use_cudnn'
:
self
.
use_cudnn
,
'use_mkldnn'
:
self
.
use_mkldnn
,
'data_format'
:
self
.
data_format
'data_format'
:
self
.
data_format
,
'exhaustive_search'
:
self
.
exhaustive_search
}
self
.
outputs
=
{
'Output'
:
output
}
...
...
@@ -361,6 +363,12 @@ class TestDepthwiseConvWithDilation2(TestConv2dOp):
self
.
op_type
=
"depthwise_conv2d"
class
TestCUDNNExhaustiveSearch
(
TestConv2dOp
):
def
init_kernel_type
(
self
):
self
.
use_cudnn
=
True
self
.
exhaustive_search
=
True
# Please Don't remove the following code.
# Currently, CI use cudnn V5.0 which not support dilation conv.
# class TestCUDNNWithDilation(TestWithDilation):
...
...
python/paddle/fluid/tests/unittests/test_conv3d_op.py
浏览文件 @
d231e550
...
...
@@ -335,6 +335,12 @@ class TestFP16WithInput1x1Filter1x1CUDNN(TestWithInput1x1Filter1x1):
self
.
check_output_with_place
(
place
,
atol
=
2e-2
)
class
TestCUDNNExhaustiveSearch
(
TestCUDNN
):
def
init_kernel_type
(
self
):
self
.
use_cudnn
=
True
self
.
exhaustive_search
=
True
# FIXME(typhoonzero): find a way to determine if
# using cudnn > 6 in python
# class TestWithDilationCUDNN(TestWithDilation):
...
...
python/paddle/fluid/tests/unittests/test_dist_transpiler.py
浏览文件 @
d231e550
...
...
@@ -567,7 +567,6 @@ class TestDistLookupTable(TestDistLookupTableBase):
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'uniform_random'
,
'uniform_random'
,
'recv'
,
'recv'
,
'recv'
,
'fetch_barrier'
,
'concat'
,
'fake_init'
...
...
@@ -639,7 +638,7 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase):
# 5 save table
self
.
assertEqual
([
op
.
type
for
op
in
pserver1
.
blocks
[
5
].
ops
],
[
"save"
])
trainer
,
_
=
self
.
get_trainer
(
config
)
trainer
,
trainer_startup
=
self
.
get_trainer
(
config
)
self
.
assertEqual
(
len
(
trainer
.
blocks
),
1
)
ops
=
[
'split_ids'
,
'prefetch'
,
'merge_ids'
,
'sequence_pool'
,
...
...
@@ -653,6 +652,16 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase):
'recv'
,
'concat'
]
self
.
assertEqual
([
op
.
type
for
op
in
trainer
.
blocks
[
0
].
ops
],
ops
)
startup_ops
=
[
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'uniform_random'
,
'uniform_random'
,
'recv'
,
'recv'
,
'recv'
,
'fetch_barrier'
,
'concat'
,
'fake_init'
]
self
.
assertEqual
([
op
.
type
for
op
in
trainer_startup
.
blocks
[
0
].
ops
],
startup_ops
)
class
TestDistLookupTableSliceSize
(
TestDistLookupTableBase
):
...
...
python/paddle/fluid/tests/unittests/test_
bilinear_interp
_op.py
→
python/paddle/fluid/tests/unittests/test_
interpolate
_op.py
浏览文件 @
d231e550
...
...
@@ -20,10 +20,44 @@ from op_test import OpTest
import
paddle.fluid.core
as
core
def
bilinear_interp_np
(
input
,
out_h
,
out_w
,
out_size
):
def
nearest_neighbor_interp_np
(
X
,
out_h
,
out_w
,
out_size
=
None
,
actual_shape
=
None
):
"""nearest neighbor interpolation implement in shape [N, C, H, W]"""
if
out_size
is
not
None
:
out_h
=
out_size
[
0
]
out_w
=
out_size
[
1
]
if
actual_shape
is
not
None
:
out_h
=
actual_shape
[
0
]
out_w
=
actual_shape
[
1
]
n
,
c
,
in_h
,
in_w
=
X
.
shape
ratio_h
=
ratio_w
=
0.0
if
out_h
>
1
:
ratio_h
=
(
in_h
-
1.0
)
/
(
out_h
-
1.0
)
if
out_w
>
1
:
ratio_w
=
(
in_w
-
1.0
)
/
(
out_w
-
1.0
)
out
=
np
.
zeros
((
n
,
c
,
out_h
,
out_w
))
for
i
in
range
(
out_h
):
in_i
=
int
(
ratio_h
*
i
+
0.5
)
for
j
in
range
(
out_w
):
in_j
=
int
(
ratio_w
*
j
+
0.5
)
out
[:,
:,
i
,
j
]
=
X
[:,
:,
in_i
,
in_j
]
return
out
.
astype
(
X
.
dtype
)
def
bilinear_interp_np
(
input
,
out_h
,
out_w
,
out_size
=
None
,
actual_shape
=
None
):
"""bilinear interpolation implement in shape [N, C, H, W]"""
if
out_size
is
not
None
:
out_h
=
out_size
[
0
]
out_w
=
out_size
[
1
]
if
actual_shape
is
not
None
:
out_h
=
actual_shape
[
0
]
out_w
=
actual_shape
[
1
]
batch_size
,
channel
,
in_h
,
in_w
=
input
.
shape
if
out_h
>
1
:
ratio_h
=
(
in_h
-
1.0
)
/
(
out_h
-
1.0
)
...
...
@@ -53,18 +87,32 @@ def bilinear_interp_np(input, out_h, out_w, out_size):
return
out
.
astype
(
input
.
dtype
)
class
TestBilinearInterpOp
(
OpTest
):
INTERPOLATE_FUNCS
=
{
'bilinear'
:
bilinear_interp_np
,
'nearest'
:
nearest_neighbor_interp_np
,
}
class
TestInterpolateOp
(
OpTest
):
def
setUp
(
self
):
self
.
out_size
=
None
self
.
actual_shape
=
None
self
.
init_test_case
()
self
.
op_type
=
"
bilinear_interp
"
self
.
op_type
=
"
interpolate
"
input_np
=
np
.
random
.
random
(
self
.
input_shape
).
astype
(
"float32"
)
output_np
=
bilinear_interp_np
(
input_np
,
self
.
out_h
,
self
.
out_w
,
self
.
out_size
)
output_np
=
INTERPOLATE_FUNCS
[
self
.
interp_method
](
input_np
,
self
.
out_h
,
self
.
out_w
,
self
.
out_size
,
self
.
actual_shape
)
self
.
inputs
=
{
'X'
:
input_np
}
if
self
.
out_size
is
not
None
:
self
.
inputs
[
'OutSize'
]
=
self
.
out_size
self
.
attrs
=
{
'out_h'
:
self
.
out_h
,
'out_w'
:
self
.
out_w
}
if
self
.
actual_shape
is
not
None
:
self
.
inputs
[
'OutSize'
]
=
self
.
actual_shape
self
.
attrs
=
{
'out_h'
:
self
.
out_h
,
'out_w'
:
self
.
out_w
,
'interp_method'
:
self
.
interp_method
}
self
.
outputs
=
{
'Out'
:
output_np
}
def
test_check_output
(
self
):
...
...
@@ -74,90 +122,209 @@ class TestBilinearInterpOp(OpTest):
self
.
check_grad
([
'X'
],
'Out'
,
in_place
=
True
)
def
init_test_case
(
self
):
self
.
interp_method
=
'bilinear'
self
.
input_shape
=
[
2
,
3
,
4
,
4
]
self
.
out_h
=
2
self
.
out_w
=
2
self
.
out_size
=
np
.
array
([
3
,
3
]).
astype
(
"int32"
)
class
Test
Case1
(
TestBilinearInterp
Op
):
class
Test
BilinearInterpCase1
(
TestInterpolate
Op
):
def
init_test_case
(
self
):
self
.
interp_method
=
'bilinear'
self
.
input_shape
=
[
4
,
1
,
7
,
8
]
self
.
out_h
=
1
self
.
out_w
=
1
class
Test
Case2
(
TestBilinearInterp
Op
):
class
Test
BilinearInterpCase2
(
TestInterpolate
Op
):
def
init_test_case
(
self
):
self
.
interp_method
=
'bilinear'
self
.
input_shape
=
[
3
,
3
,
9
,
6
]
self
.
out_h
=
12
self
.
out_w
=
12
class
Test
Case3
(
TestBilinearInterp
Op
):
class
Test
BilinearInterpCase3
(
TestInterpolate
Op
):
def
init_test_case
(
self
):
self
.
interp_method
=
'bilinear'
self
.
input_shape
=
[
1
,
1
,
128
,
64
]
self
.
out_h
=
64
self
.
out_w
=
128
class
Test
Case4
(
TestBilinearInterp
Op
):
class
Test
BilinearInterpCase4
(
TestInterpolate
Op
):
def
init_test_case
(
self
):
self
.
interp_method
=
'bilinear'
self
.
input_shape
=
[
4
,
1
,
7
,
8
]
self
.
out_h
=
1
self
.
out_w
=
1
self
.
out_size
=
np
.
array
([
2
,
2
]).
astype
(
"int32"
)
class
Test
Case5
(
TestBilinearInterp
Op
):
class
Test
BilinearInterpCase5
(
TestInterpolate
Op
):
def
init_test_case
(
self
):
self
.
interp_method
=
'bilinear'
self
.
input_shape
=
[
3
,
3
,
9
,
6
]
self
.
out_h
=
12
self
.
out_w
=
12
self
.
out_size
=
np
.
array
([
11
,
11
]).
astype
(
"int32"
)
class
Test
Case6
(
TestBilinearInterp
Op
):
class
Test
BilinearInterpCase6
(
TestInterpolate
Op
):
def
init_test_case
(
self
):
self
.
interp_method
=
'bilinear'
self
.
input_shape
=
[
1
,
1
,
128
,
64
]
self
.
out_h
=
64
self
.
out_w
=
128
self
.
out_size
=
np
.
array
([
65
,
129
]).
astype
(
"int32"
)
class
TestBilinearInterpOpUint8
(
OpTest
):
class
TestBilinearInterpActualShape
(
TestInterpolateOp
):
def
init_test_case
(
self
):
self
.
interp_method
=
'bilinear'
self
.
input_shape
=
[
3
,
2
,
32
,
16
]
self
.
out_h
=
64
self
.
out_w
=
32
self
.
out_size
=
np
.
array
([
66
,
40
]).
astype
(
"int32"
)
class
TestBilinearInterpBigScale
(
TestInterpolateOp
):
def
init_test_case
(
self
):
self
.
interp_method
=
'bilinear'
self
.
input_shape
=
[
4
,
4
,
64
,
32
]
self
.
out_h
=
100
self
.
out_w
=
50
self
.
out_size
=
np
.
array
([
101
,
51
]).
astype
(
'int32'
)
class
TestInterpolateOpUint8
(
OpTest
):
def
setUp
(
self
):
self
.
out_size
=
None
self
.
actual_shape
=
None
self
.
init_test_case
()
self
.
op_type
=
"
bilinear_interp
"
self
.
op_type
=
"
interpolate
"
input_np
=
np
.
random
.
randint
(
low
=
0
,
high
=
256
,
size
=
self
.
input_shape
).
astype
(
"uint8"
)
output_np
=
bilinear_interp_np
(
input_np
,
self
.
out_h
,
self
.
out_w
,
self
.
out_siz
e
)
output_np
=
INTERPOLATE_FUNCS
[
self
.
interp_method
](
input_np
,
self
.
out_h
,
self
.
out_w
,
self
.
out_size
,
self
.
actual_shap
e
)
self
.
inputs
=
{
'X'
:
input_np
}
if
self
.
out_size
is
not
None
:
self
.
inputs
[
'OutSize'
]
=
self
.
out_size
self
.
attrs
=
{
'out_h'
:
self
.
out_h
,
'out_w'
:
self
.
out_w
}
self
.
attrs
=
{
'out_h'
:
self
.
out_h
,
'out_w'
:
self
.
out_w
,
'interp_method'
:
self
.
interp_method
}
self
.
outputs
=
{
'Out'
:
output_np
}
def
test_check_output
(
self
):
self
.
check_output_with_place
(
place
=
core
.
CPUPlace
(),
atol
=
1
)
def
init_test_case
(
self
):
self
.
interp_method
=
'bilinear'
self
.
input_shape
=
[
1
,
3
,
9
,
6
]
self
.
out_h
=
10
self
.
out_w
=
9
class
TestCase1Uint8
(
TestBilinearInterpOpUint8
):
class
TestBilinearInterpCase1Uint8
(
TestInterpolateOpUint8
):
def
init_test_case
(
self
):
self
.
interp_method
=
'bilinear'
self
.
input_shape
=
[
2
,
3
,
128
,
64
]
self
.
out_h
=
120
self
.
out_w
=
50
class
TestBilinearInterpCase2Uint8
(
TestInterpolateOpUint8
):
def
init_test_case
(
self
):
self
.
interp_method
=
'bilinear'
self
.
input_shape
=
[
4
,
1
,
7
,
8
]
self
.
out_h
=
5
self
.
out_w
=
13
self
.
out_size
=
np
.
array
([
6
,
15
]).
astype
(
"int32"
)
class
TestNearestNeighborInterpCase1
(
TestInterpolateOp
):
def
init_test_case
(
self
):
self
.
interp_method
=
'nearest'
self
.
input_shape
=
[
4
,
1
,
7
,
8
]
self
.
out_h
=
1
self
.
out_w
=
1
class
TestNearestNeighborInterpCase2
(
TestInterpolateOp
):
def
init_test_case
(
self
):
self
.
interp_method
=
'nearest'
self
.
input_shape
=
[
3
,
3
,
9
,
6
]
self
.
out_h
=
12
self
.
out_w
=
12
class
TestNearestNeighborInterpCase3
(
TestInterpolateOp
):
def
init_test_case
(
self
):
self
.
interp_method
=
'nearest'
self
.
input_shape
=
[
1
,
1
,
128
,
64
]
self
.
out_h
=
64
self
.
out_w
=
128
class
TestNearestNeighborInterpCase4
(
TestInterpolateOp
):
def
init_test_case
(
self
):
self
.
interp_method
=
'nearest'
self
.
input_shape
=
[
4
,
1
,
7
,
8
]
self
.
out_h
=
1
self
.
out_w
=
1
self
.
out_size
=
np
.
array
([
2
,
2
]).
astype
(
"int32"
)
class
TestNearestNeighborInterpCase5
(
TestInterpolateOp
):
def
init_test_case
(
self
):
self
.
interp_method
=
'nearest'
self
.
input_shape
=
[
3
,
3
,
9
,
6
]
self
.
out_h
=
12
self
.
out_w
=
12
self
.
out_size
=
np
.
array
([
11
,
11
]).
astype
(
"int32"
)
class
TestNearestNeighborInterpCase6
(
TestInterpolateOp
):
def
init_test_case
(
self
):
self
.
interp_method
=
'nearest'
self
.
input_shape
=
[
1
,
1
,
128
,
64
]
self
.
out_h
=
64
self
.
out_w
=
128
self
.
out_size
=
np
.
array
([
65
,
129
]).
astype
(
"int32"
)
class
TestNearestNeighborInterpActualShape
(
TestInterpolateOp
):
def
init_test_case
(
self
):
self
.
interp_method
=
'nearest'
self
.
input_shape
=
[
3
,
2
,
32
,
16
]
self
.
out_h
=
64
self
.
out_w
=
32
self
.
out_size
=
np
.
array
([
66
,
40
]).
astype
(
"int32"
)
class
TestNearestNeighborInterpBigScale
(
TestInterpolateOp
):
def
init_test_case
(
self
):
self
.
interp_method
=
'nearest'
self
.
input_shape
=
[
4
,
4
,
64
,
32
]
self
.
out_h
=
100
self
.
out_w
=
50
self
.
out_size
=
np
.
array
([
101
,
51
]).
astype
(
'int32'
)
class
TestNearestNeighborInterpCase1Uint8
(
TestInterpolateOpUint8
):
def
init_test_case
(
self
):
self
.
interp_method
=
'nearest'
self
.
input_shape
=
[
2
,
3
,
128
,
64
]
self
.
out_h
=
120
self
.
out_w
=
50
class
Test
Case2Uint8
(
TestBilinearInterp
OpUint8
):
class
Test
NearestNeighborInterpCase2Uint8
(
TestInterpolate
OpUint8
):
def
init_test_case
(
self
):
self
.
interp_method
=
'nearest'
self
.
input_shape
=
[
4
,
1
,
7
,
8
]
self
.
out_h
=
5
self
.
out_w
=
13
...
...
python/paddle/fluid/tests/unittests/test_layers.py
浏览文件 @
d231e550
...
...
@@ -496,6 +496,16 @@ class TestBook(unittest.TestCase):
self
.
assertIsNotNone
(
output
)
print
(
str
(
program
))
def
test_resize_nearest
(
self
):
program
=
Program
()
with
program_guard
(
program
):
x
=
layers
.
data
(
name
=
'x'
,
shape
=
[
3
,
9
,
6
],
dtype
=
"float32"
)
output
=
layers
.
resize_nearest
(
x
,
out_shape
=
[
12
,
12
])
self
.
assertIsNotNone
(
output
)
output
=
layers
.
resize_nearest
(
x
,
scale
=
3
)
self
.
assertIsNotNone
(
output
)
print
(
str
(
program
))
def
test_polygon_box_transform
(
self
):
program
=
Program
()
with
program_guard
(
program
):
...
...
python/paddle/fluid/tests/unittests/test_similarity_focus_op.py
0 → 100755
浏览文件 @
d231e550
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
unittest
import
numpy
as
np
import
paddle.fluid.core
as
core
from
op_test
import
OpTest
class
TestSimilarityFocusOp
(
OpTest
):
def
setUp
(
self
):
self
.
op_type
=
"similarity_focus"
batch_size
=
2
x_dim
,
y_dim
,
z_dim
=
3
,
2
,
2
self
.
inputs
=
{
'X'
:
np
.
array
([[[[
0.8
,
0.1
],
[
0.4
,
0.5
]],
[[
0.9
,
0.7
],
[
0.9
,
0.9
]],
[[
0.8
,
0.9
],
[
0.1
,
0.2
]]],
[[[
0.2
,
0.5
],
[
0.3
,
0.4
]],
[[
0.9
,
0.7
],
[
0.8
,
0.4
]],
[[
0.0
,
0.2
],
[
0.4
,
0.7
]]]]),
}
self
.
attrs
=
{
'axis'
:
1
,
'indexes'
:
[
0
],
}
output
=
None
for
batch
in
range
(
batch_size
):
res
=
np
.
zeros
((
1
,
y_dim
,
z_dim
)).
astype
(
"float32"
).
reshape
(
-
1
)
for
index
in
self
.
attrs
[
'indexes'
]:
channel
=
self
.
inputs
[
'X'
][
batch
,
index
,
:,
:].
reshape
(
-
1
).
copy
(
)
tag1
=
[
0
for
i
in
range
(
y_dim
)]
tag2
=
[
0
for
i
in
range
(
z_dim
)]
cnt
=
0
for
i
in
range
(
channel
.
size
):
index
=
channel
.
argmax
()
idx1
=
index
//
z_dim
idx2
=
index
%
z_dim
if
tag1
[
idx1
]
+
tag2
[
idx2
]
==
0
:
tag1
[
idx1
]
=
1
tag2
[
idx2
]
=
1
res
[
index
]
=
1
cnt
+=
1
if
cnt
==
min
(
y_dim
,
z_dim
):
break
channel
[
index
]
=
-
1
res
=
res
.
reshape
(
1
,
y_dim
,
z_dim
).
repeat
([
x_dim
],
axis
=
0
)
res
=
res
.
reshape
(
1
,
x_dim
,
y_dim
,
z_dim
)
if
output
is
not
None
:
output
=
np
.
concatenate
((
output
,
res
),
axis
=
0
)
else
:
output
=
res
self
.
outputs
=
{
'Out'
:
output
}
def
test_check_output
(
self
):
self
.
check_output
()
class
TestSimilarityFocusOp_axis1
(
OpTest
):
def
setUp
(
self
):
self
.
op_type
=
"similarity_focus"
batch_size
=
3
x_dim
,
y_dim
,
z_dim
=
4
,
5
,
6
self
.
inputs
=
{
'X'
:
np
.
random
.
random
(
(
batch_size
,
x_dim
,
y_dim
,
z_dim
)).
astype
(
"float32"
),
}
self
.
attrs
=
{
'axis'
:
1
,
'indexes'
:
[
0
,
3
],
}
output
=
None
for
batch
in
range
(
batch_size
):
res
=
np
.
zeros
((
1
,
y_dim
,
z_dim
)).
astype
(
"float32"
).
reshape
(
-
1
)
for
index
in
self
.
attrs
[
'indexes'
]:
channel
=
self
.
inputs
[
'X'
][
batch
,
index
,
:,
:].
reshape
(
-
1
).
copy
(
)
tag1
=
[
0
for
i
in
range
(
y_dim
)]
tag2
=
[
0
for
i
in
range
(
z_dim
)]
cnt
=
0
for
i
in
range
(
channel
.
size
):
index
=
channel
.
argmax
()
idx1
=
index
//
z_dim
idx2
=
index
%
z_dim
if
tag1
[
idx1
]
+
tag2
[
idx2
]
==
0
:
tag1
[
idx1
]
=
1
tag2
[
idx2
]
=
1
res
[
index
]
=
1
cnt
+=
1
if
cnt
==
min
(
y_dim
,
z_dim
):
break
channel
[
index
]
=
-
1
res
=
res
.
reshape
(
1
,
y_dim
,
z_dim
)
res
=
res
.
repeat
([
x_dim
],
axis
=
0
)
res
=
res
.
reshape
(
1
,
x_dim
,
y_dim
,
z_dim
)
if
output
is
not
None
:
output
=
np
.
concatenate
((
output
,
res
),
axis
=
0
)
else
:
output
=
res
self
.
outputs
=
{
'Out'
:
output
}
def
test_check_output
(
self
):
self
.
check_output
()
class
TestSimilarityFocusOp_axis2
(
OpTest
):
def
setUp
(
self
):
self
.
op_type
=
"similarity_focus"
batch_size
=
6
x_dim
,
y_dim
,
z_dim
=
7
,
8
,
9
self
.
inputs
=
{
'X'
:
np
.
random
.
random
(
(
batch_size
,
x_dim
,
y_dim
,
z_dim
)).
astype
(
"float32"
),
}
self
.
attrs
=
{
'axis'
:
2
,
'indexes'
:
[
0
,
3
,
5
],
}
output
=
None
for
batch
in
range
(
batch_size
):
res
=
np
.
zeros
((
x_dim
,
1
,
z_dim
)).
astype
(
"float32"
).
reshape
(
-
1
)
for
index
in
self
.
attrs
[
'indexes'
]:
channel
=
self
.
inputs
[
'X'
][
batch
,
:,
index
,
:].
reshape
(
-
1
).
copy
(
)
tag1
=
[
0
for
i
in
range
(
x_dim
)]
tag2
=
[
0
for
i
in
range
(
z_dim
)]
cnt
=
0
for
i
in
range
(
channel
.
size
):
index
=
channel
.
argmax
()
idx1
=
index
//
z_dim
idx2
=
index
%
z_dim
if
tag1
[
idx1
]
+
tag2
[
idx2
]
==
0
:
tag1
[
idx1
]
=
1
tag2
[
idx2
]
=
1
res
[
index
]
=
1
cnt
+=
1
if
cnt
==
min
(
x_dim
,
z_dim
):
break
channel
[
index
]
=
-
1
res
=
res
.
reshape
(
x_dim
,
1
,
z_dim
)
res
=
res
.
repeat
([
y_dim
],
axis
=
1
)
res
=
res
.
reshape
(
1
,
x_dim
,
y_dim
,
z_dim
)
if
output
is
not
None
:
output
=
np
.
concatenate
((
output
,
res
),
axis
=
0
)
else
:
output
=
res
self
.
outputs
=
{
'Out'
:
output
}
def
test_check_output
(
self
):
self
.
check_output
()
class
TestSimilarityFocusOp_axis3
(
OpTest
):
def
setUp
(
self
):
self
.
op_type
=
"similarity_focus"
batch_size
=
64
x_dim
,
y_dim
,
z_dim
=
48
,
48
,
13
self
.
inputs
=
{
'X'
:
np
.
random
.
random
(
(
batch_size
,
x_dim
,
y_dim
,
z_dim
)).
astype
(
"float32"
),
}
self
.
attrs
=
{
'axis'
:
3
,
'indexes'
:
[
0
,
2
,
7
,
9
],
}
output
=
None
for
batch
in
range
(
batch_size
):
res
=
np
.
zeros
((
x_dim
,
y_dim
,
1
)).
astype
(
"float32"
).
reshape
(
-
1
)
for
index
in
self
.
attrs
[
'indexes'
]:
channel
=
self
.
inputs
[
'X'
][
batch
,
:,
:,
index
].
reshape
(
-
1
).
copy
(
)
tag1
=
[
0
for
i
in
range
(
x_dim
)]
tag2
=
[
0
for
i
in
range
(
y_dim
)]
cnt
=
0
for
i
in
range
(
channel
.
size
):
index
=
channel
.
argmax
()
idx1
=
index
//
y_dim
idx2
=
index
%
y_dim
if
tag1
[
idx1
]
+
tag2
[
idx2
]
==
0
:
tag1
[
idx1
]
=
1
tag2
[
idx2
]
=
1
res
[
index
]
=
1
cnt
+=
1
if
cnt
==
min
(
x_dim
,
y_dim
):
break
channel
[
index
]
=
-
1
res
=
res
.
reshape
(
x_dim
,
y_dim
,
1
)
res
=
res
.
repeat
([
z_dim
],
axis
=
2
)
res
=
res
.
reshape
(
1
,
x_dim
,
y_dim
,
z_dim
)
if
output
is
not
None
:
output
=
np
.
concatenate
((
output
,
res
),
axis
=
0
)
else
:
output
=
res
self
.
outputs
=
{
'Out'
:
output
}
def
test_check_output
(
self
):
self
.
check_output
()
if
__name__
==
"__main__"
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_tensor_array_to_tensor.py
0 → 100644
浏览文件 @
d231e550
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
unittest
import
numpy
import
paddle.fluid
as
fluid
import
paddle.fluid.core
as
core
from
paddle.fluid.op
import
Operator
from
paddle.fluid.executor
import
Executor
class
TestLoDTensorArrayConcat
(
unittest
.
TestCase
):
def
setUp
(
self
):
self
.
op_type
=
"tensor_array_to_tensor"
self
.
attrs
=
{
"axis"
:
0
}
self
.
outputs
=
[
"Out"
]
def
test_get_set
(
self
):
scope
=
core
.
Scope
()
program
=
fluid
.
Program
()
block
=
program
.
global_block
()
input_arr
=
block
.
create_var
(
name
=
"tmp_lod_tensor_array"
,
type
=
core
.
VarDesc
.
VarType
.
LOD_TENSOR_ARRAY
)
input_arr
.
persistable
=
True
input_arr_var
=
scope
.
var
(
'tmp_lod_tensor_array'
)
input_tensor_array
=
input_arr_var
.
get_lod_tensor_array
()
self
.
assertEqual
(
0
,
len
(
input_tensor_array
))
cpu
=
core
.
CPUPlace
()
for
i
in
range
(
10
):
t
=
core
.
LoDTensor
()
if
i
==
0
:
t
.
set
(
numpy
.
array
([[
i
],
[
i
]],
dtype
=
'float32'
),
cpu
)
else
:
t
.
set
(
numpy
.
array
([[
i
]],
dtype
=
'float32'
),
cpu
)
input_tensor_array
.
append
(
t
)
self
.
assertEqual
(
10
,
len
(
input_tensor_array
))
random_grad
=
numpy
.
random
.
random_sample
([
11
]).
astype
(
numpy
.
float32
)
y_out
=
block
.
create_var
(
name
=
"Out"
)
y_out
.
persistable
=
True
y_out_index
=
block
.
create_var
(
name
=
"OutIndex"
)
y_out_index
.
persistable
=
True
y_grad_arr
=
block
.
create_var
(
name
=
'Out@GRAD'
,
dtype
=
'float32'
,
shape
=
[
11
])
y_grad_arr
.
persistable
=
True
y_grad
=
scope
.
var
(
'Out@GRAD'
)
y_grad_tensor
=
y_grad
.
get_tensor
()
y_grad_tensor
.
set
(
random_grad
,
cpu
)
op
=
block
.
append_op
(
type
=
self
.
op_type
,
inputs
=
{
"X"
:
input_arr
},
outputs
=
{
"Out"
:
y_out
,
"OutIndex"
:
y_out_index
},
attrs
=
self
.
attrs
)
out_grad
=
block
.
create_var
(
name
=
"tmp_lod_tensor_array@GRAD"
,
type
=
core
.
VarDesc
.
VarType
.
LOD_TENSOR_ARRAY
)
out_grad
.
persistable
=
True
grad_op_desc_list
,
op_grad_to_var
=
core
.
get_grad_op_desc
(
op
.
desc
,
set
(),
[])
grad_op_desc
=
grad_op_desc_list
[
0
]
new_op_desc
=
block
.
desc
.
append_op
()
new_op_desc
.
copy_from
(
grad_op_desc
)
for
var_name
in
grad_op_desc
.
output_arg_names
():
block
.
desc
.
var
(
var_name
.
encode
(
"ascii"
))
grad_op_desc
.
infer_var_type
(
block
.
desc
)
grad_op_desc
.
infer_shape
(
block
.
desc
)
for
arg
in
grad_op_desc
.
output_arg_names
():
grad_var
=
block
.
desc
.
find_var
(
arg
.
encode
(
"ascii"
))
grad_var
.
set_dtype
(
core
.
VarDesc
.
VarType
.
FP32
)
fetch_list
=
[]
fetch_list
.
append
(
block
.
var
(
'Out'
))
fetch_list
.
append
(
block
.
var
(
'OutIndex'
))
exe
=
fluid
.
Executor
(
fluid
.
CPUPlace
())
out
=
exe
.
run
(
program
,
fetch_list
=
fetch_list
,
scope
=
scope
)
#print ("index: ", numpy.array(out[1]))
# test forward
tensor_res
=
numpy
.
array
(
out
[
0
])
tensor_res_out_idx
=
numpy
.
array
(
out
[
1
])
tensor_gt
=
numpy
.
array
(
[
0
]
+
[
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
],
dtype
=
'float32'
)
self
.
assertEqual
(
len
(
tensor_res
),
len
(
tensor_gt
))
self
.
assertEqual
(
len
(
tensor_res_out_idx
),
10
)
for
i
in
range
(
len
(
tensor_res
)):
self
.
assertEqual
(
tensor_res
[
i
],
tensor_gt
[
i
])
for
i
in
range
(
len
(
tensor_res_out_idx
)):
if
i
==
0
:
self
.
assertEqual
(
tensor_res_out_idx
[
i
],
2
)
else
:
self
.
assertEqual
(
tensor_res_out_idx
[
i
],
1
)
# test backward
grad_tensor
=
scope
.
var
(
'tmp_lod_tensor_array@GRAD'
)
grad_tensor_array
=
grad_tensor
.
get_lod_tensor_array
()
self
.
assertEqual
(
10
,
len
(
grad_tensor_array
))
for
i
in
range
(
len
(
grad_tensor_array
)):
if
i
==
0
:
self
.
assertEqual
(
numpy
.
array
(
grad_tensor_array
[
i
])[
0
],
numpy
.
array
(
random_grad
[
i
]))
self
.
assertEqual
(
numpy
.
array
(
grad_tensor_array
[
i
])[
1
],
numpy
.
array
(
random_grad
[
i
+
1
]))
if
i
==
1
:
self
.
assertEqual
(
numpy
.
array
(
grad_tensor_array
[
i
]),
numpy
.
array
(
random_grad
[
i
+
1
]))
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/fluid/transpiler/distribute_transpiler.py
浏览文件 @
d231e550
...
...
@@ -31,18 +31,17 @@ Steps to transpile pserver:
"""
import
math
import
sys
import
numpy
as
np
import
collections
import
six
import
logging
from
.ps_dispatcher
import
RoundRobin
,
HashName
,
PSDispatcher
from
.ps_dispatcher
import
RoundRobin
,
PSDispatcher
from
..
import
core
,
framework
,
unique_name
from
..framework
import
Program
,
default_main_program
,
\
default_startup_program
,
Block
,
\
Parameter
,
grad_var_name
from
.details
import
*
from
..distribute_lookup_table
import
find_distributed_lookup_table
from
functools
import
reduce
LOOKUP_TABLE_TYPE
=
"lookup_table"
...
...
@@ -292,7 +291,8 @@ class DistributeTranspiler(object):
self
.
optimize_ops
,
self
.
params_grads
=
self
.
_get_optimize_pass
()
ps_dispatcher
=
self
.
config
.
split_method
(
self
.
pserver_endpoints
)
self
.
has_distributed_lookup_table
=
self
.
_has_distributed_lookup_table
()
self
.
table_name
=
find_distributed_lookup_table
(
self
.
origin_program
)
self
.
has_distributed_lookup_table
=
self
.
table_name
!=
None
self
.
param_name_to_grad_name
=
dict
()
self
.
grad_name_to_param_name
=
dict
()
for
param_var
,
grad_var
in
self
.
params_grads
:
...
...
@@ -966,28 +966,6 @@ to transpile() call.")
# ====================== private transpiler functions =====================
def
_has_distributed_lookup_table
(
self
):
# process lookup_table_op
# 1. check all lookup_table_op is distributed
# 2. check all lookup_table_op share the same table.
distributed_lookup_table_ops
=
[]
# support only one distributed_lookup_table now
self
.
table_name
=
None
for
op
in
self
.
origin_program
.
global_block
().
ops
:
if
op
.
type
==
LOOKUP_TABLE_TYPE
:
if
op
.
attr
(
'is_distributed'
)
is
True
:
if
self
.
table_name
is
None
:
self
.
table_name
=
op
.
input
(
"W"
)[
0
]
if
self
.
table_name
!=
op
.
input
(
"W"
)[
0
]:
raise
RuntimeError
(
"all distributed lookup_table_ops"
" should have only one table"
)
distributed_lookup_table_ops
.
append
(
op
)
else
:
if
self
.
table_name
is
not
None
:
assert
op
.
input
(
"W"
)[
0
]
!=
self
.
table_name
return
len
(
distributed_lookup_table_ops
)
>
0
def
_update_dist_lookup_table_vars
(
self
,
param_list
,
grad_list
,
params_grads
):
# TODO(wuyi): put find a way to put dist lookup table stuff all together.
...
...
@@ -1341,7 +1319,6 @@ to transpile() call.")
"""
create a new block to handle save checkpoint.
"""
import
os
pserver_program
.
global_block
().
create_var
(
name
=
"kLookupTablePath"
,
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录